2016-04-05 18:10:51

by Chris Mason

Subject: [PATCH RFC] select_idle_sibling experiments

Hi everyone,

We're porting the fb kernel up to 4.5, and one of our last few out-of-tree
patches is a hack to try harder to find idle cpus when waking up tasks.
This helps in pretty much every workload we run, mostly because they all
get tuned with a similar setup:

1) Find the load where latencies stop being acceptable
2) Run the server at just a little less than that

Usually this means our CPUs are just a little bit idle, and a poor
scheduler decision to place a task on a busy CPU instead of an idle CPU
ends up impacting our p99 latencies.

Mike helped us with this last year, fixing up wake_wide() to improve
things. But we still ended up having to go back to the old hack.

I started with a small-ish program to benchmark wakeup latencies. The
basic idea is a bunch of worker threads who sit around and burn CPU.
Every once in a while they send a message to a message thread.

The message thread records the time he woke up the worker, and the
worker records the delta between that time and the time he actually got
the CPU again. At the end it spits out a latency histogram. The only
thing we record is the wakeup latency; there's no measurement of 'work
done' or any of the normal things you'd expect in a benchmark.

It has knobs for cpu think time, and for how long the messenger thread
waits before replying. Here's how I'm running it with my patch:

./schbench -c 30000 -s 30000 -m 6 -t 24 -r 30
Latency percentiles (usec)
50.0000th: 50
75.0000th: 62
90.0000th: 73
95.0000th: 79
*99.0000th: 99
99.5000th: 761
99.9000th: 10160
Over=0, min=0, max=14659

This translates to cputime of 30ms, sleep time of 30ms, 6 messenger
threads, 24 workers per messenger and a run time of 30 seconds. My box
has two sockets, 24 cores each. Mainline varies a bit, but numbers like
this are typical:

./schbench -c 30000 -s 30000 -m 6 -t 24 -r 30
Latency percentiles (usec)
50.0000th: 50
75.0000th: 63
90.0000th: 76
95.0000th: 85
*99.0000th: 4680
99.5000th: 10192
99.9000th: 10928
Over=0, min=0, max=21816

A high p99 in real application performance will block a new kernel for
us. p99.5 and p99.9 are included just to show how long the tail really
is.

I've inlined schbench.c below and attached as a .gz file just in case
Exchange manages to munge it.

Now, on to the patch. I pushed some code around and narrowed the
problem down to select_idle_sibling(). We have cores going into and out
of idle fast enough that even this cut our latencies in half:

static int select_idle_sibling(struct task_struct *p, int target)
goto next;

for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
+ if (!idle_cpu(i))
goto next;
}

IOW, by the time we get down to for_each_cpu(), the idle_cpu() check
done at the top of the function is no longer valid.
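
To spell the race out, the relevant flow is roughly this (a condensed
sketch of the 4.5 code with "..." elisions, not the verbatim source):

static int select_idle_sibling(struct task_struct *p, int target)
{
        ...
        if (idle_cpu(target))   /* check #1: target looks idle right now */
                return target;
        ...
        sd = rcu_dereference(per_cpu(sd_llc, target));
        for_each_lower_domain(sd) {
                sg = sd->groups;
                do {
                        ...
                        for_each_cpu(i, sched_group_cpus(sg)) {
                                /*
                                 * check #2: a few usec later the cpus that
                                 * looked idle at check #1 have picked up
                                 * work again, so whole groups get rejected
                                 * here and we end up falling back to the
                                 * busy target.
                                 */
                                if (i == target || !idle_cpu(i))
                                        goto next;
                        }
                        ...
next:
                        sg = sg->next;
                } while (sg != sd->groups);
        }
        ...
        return target;
}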

I tried a few variations on select_idle_sibling() that preserved the
underlying goal of returning idle cores before idle SMT threads. They
were all horrible in different ways, and none of them were fast.

The patch below just makes select_idle_sibling pick the first idle
thread it can find. When I ran it through production workloads here, it
was faster than the patch we've been carrying around for the last few
years.


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b7d4b..c41baa6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4974,7 +4974,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 static int select_idle_sibling(struct task_struct *p, int target)
 {
         struct sched_domain *sd;
-        struct sched_group *sg;
         int i = task_cpu(p);

         if (idle_cpu(target))
@@ -4990,24 +4989,14 @@ static int select_idle_sibling(struct task_struct *p, int target)
          * Otherwise, iterate the domains and find an elegible idle cpu.
          */
         sd = rcu_dereference(per_cpu(sd_llc, target));
-        for_each_lower_domain(sd) {
-                sg = sd->groups;
-                do {
-                        if (!cpumask_intersects(sched_group_cpus(sg),
-                                                tsk_cpus_allowed(p)))
-                                goto next;
-
-                        for_each_cpu(i, sched_group_cpus(sg)) {
-                                if (i == target || !idle_cpu(i))
-                                        goto next;
-                        }
+        if (!sd)
+                goto done;

-                        target = cpumask_first_and(sched_group_cpus(sg),
-                                        tsk_cpus_allowed(p));
+        for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+                if (cpu_active(i) && idle_cpu(i)) {
+                        target = i;
                         goto done;
-next:
-                        sg = sg->next;
-                } while (sg != sd->groups);
+                }
         }
 done:
         return target;

--------------------------------------------

/*
* schbench.c
*
* Copyright (C) 2016 Facebook
* Chris Mason <[email protected]>
*
* GPLv2, portions copied from the kernel and from Jens Axboe's fio
*
* gcc -Wall -O0 -W schbench.c -o schbench -lpthread
*/
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <getopt.h>
#include <sys/time.h>
#include <time.h>
#include <string.h>
#include <linux/futex.h>
#include <sys/syscall.h>

#define PLAT_BITS 8
#define PLAT_VAL (1 << PLAT_BITS)
#define PLAT_GROUP_NR 19
#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL)
#define PLAT_LIST_MAX 20

/* -m number of message threads */
static int message_threads = 2;
/* -t number of workers per message thread */
static int worker_threads = 16;
/* -r seconds */
static int runtime = 30;
/* -s usec */
static int sleeptime = 30000;
/* -c usec */
static unsigned long long cputime = 30000;
/* -a, bool */
static int autobench = 0;

/* the latency histogram uses this to pitch outliers */
static unsigned int max_us = 50000;

/* main() sets this to the time when we should all stop doing work */
static struct timeval global_stop;

/* the message threads flip this to true when they decide runtime is up */
static unsigned long stopping = 0;


/*
* one stat struct per thread data, when the workers sleep this records the
* latency between when they are woken up and when they actually get the
* CPU again. The message threads sum up the stats of all the workers and
* then bubble them up to main() for printing
*/
struct stats {
unsigned int plat[PLAT_NR];
unsigned int nr_samples;
unsigned int max;
unsigned int min;
unsigned int over;
};

/* this defines which latency profiles get printed */
#define PLIST_P99 4
static double plist[PLAT_LIST_MAX] = { 50.0, 75.0, 90.0, 95.0, 99.0, 99.5, 99.9 };

enum {
HELP_LONG_OPT = 1,
};

char *option_string = "am:t:s:c:r:";
static struct option long_options[] = {
{"auto", no_argument, 0, 'a'},
{"message-threads", required_argument, 0, 'm'},
{"threads", required_argument, 0, 't'},
{"runtime", required_argument, 0, 'r'},
{"sleeptime", required_argument, 0, 's'},
{"cputime", required_argument, 0, 'c'},
{"help", no_argument, 0, HELP_LONG_OPT},
{0, 0, 0, 0}
};

static void print_usage(void)
{
fprintf(stderr, "schbench usage:\n"
"\t-d (--dispatch-threads): number of message threads (def: 2)\n"
"\t-t (--threads): worker threads per message thread (def: 16)\n"
"\t-r (--runtime): How long to run before exiting (seconds, def: 30)\n"
"\t-s (--sleeptime): Message thread latency (usec, def: 10000\n"
"\t-c (--cputime): How long to think during loop (usec, def: 10000\n"
);
exit(1);
}

static void parse_options(int ac, char **av)
{
int c;

while (1) {
int option_index = 0;

c = getopt_long(ac, av, option_string,
long_options, &option_index);

if (c == -1)
break;

switch(c) {
case 'a':
autobench = 1;
break;
case 's':
sleeptime = atoi(optarg);
break;
case 'c':
cputime = atoi(optarg);
break;
case 'm':
message_threads = atoi(optarg);
break;
case 't':
worker_threads = atoi(optarg);
break;
case 'r':
runtime = atoi(optarg);
break;
case '?':
case HELP_LONG_OPT:
print_usage();
break;
default:
break;
}
}

if (optind < ac) {
fprintf(stderr, "Error Extra arguments '%s'\n", av[optind]);
exit(1);
}
}

void tvsub(struct timeval * tdiff, struct timeval * t1, struct timeval * t0)
{
tdiff->tv_sec = t1->tv_sec - t0->tv_sec;
tdiff->tv_usec = t1->tv_usec - t0->tv_usec;
if (tdiff->tv_usec < 0 && tdiff->tv_sec > 0) {
tdiff->tv_sec--;
tdiff->tv_usec += 1000000;
if (tdiff->tv_usec < 0) {
fprintf(stderr, "lat_fs: tvsub shows test time ran backwards!\n");
exit(1);
}
}

/* time shouldn't go backwards!!! */
if (tdiff->tv_usec < 0 || t1->tv_sec < t0->tv_sec) {
tdiff->tv_sec = 0;
tdiff->tv_usec = 0;
}
}

/*
* returns the difference between start and stop in usecs. Negative values
* are turned into 0
*/
unsigned long long tvdelta(struct timeval *start, struct timeval *stop)
{
struct timeval td;
unsigned long long usecs;

tvsub(&td, stop, start);
usecs = td.tv_sec;
usecs *= 1000000;
usecs += td.tv_usec;
return (usecs);
}

/* mr axboe's magic latency histogram */
static unsigned int plat_val_to_idx(unsigned int val)
{
unsigned int msb, error_bits, base, offset;

/* Find MSB starting from bit 0 */
if (val == 0)
msb = 0;
else
msb = sizeof(val)*8 - __builtin_clz(val) - 1;

/*
* MSB <= (PLAT_BITS-1), cannot be rounded off. Use
* all bits of the sample as index
*/
if (msb <= PLAT_BITS)
return val;

/* Compute the number of error bits to discard*/
error_bits = msb - PLAT_BITS;

/* Compute the number of buckets before the group */
base = (error_bits + 1) << PLAT_BITS;

/*
* Discard the error bits and apply the mask to find the
* index for the buckets in the group
*/
offset = (PLAT_VAL - 1) & (val >> error_bits);

/* Make sure the index does not exceed (array size - 1) */
return (base + offset) < (PLAT_NR - 1) ?
(base + offset) : (PLAT_NR - 1);
}

/*
* Convert the given index of the bucket array to the value
* represented by the bucket
*/
static unsigned int plat_idx_to_val(unsigned int idx)
{
unsigned int error_bits, k, base;

if (idx >= PLAT_NR) {
fprintf(stderr, "idx %u is too large\n", idx);
exit(1);
}

/* MSB <= (PLAT_BITS-1), cannot be rounded off. Use
* all bits of the sample as index */
if (idx < (PLAT_VAL << 1))
return idx;

/* Find the group and compute the minimum value of that group */
error_bits = (idx >> PLAT_BITS) - 1;
base = 1 << (error_bits + PLAT_BITS);

/* Find its bucket number of the group */
k = idx % PLAT_VAL;

/* Return the mean of the range of the bucket */
return base + ((k + 0.5) * (1 << error_bits));
}
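
/*
 * A rough feel for the resolution (PLAT_BITS == 8): latencies below
 * 512 usec are recorded exactly; above that, each power-of-two range
 * is split into 256 buckets. A 10000 usec sample has msb == 13, so it
 * lands in a 2^(13-8) == 32 usec wide bucket, and plat_idx_to_val()
 * reports the midpoint of that 32 usec range when printing percentiles.
 */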


static unsigned int calc_percentiles(unsigned int *io_u_plat, unsigned long nr,
unsigned int **output)
{
unsigned long sum = 0;
unsigned int len, i, j = 0;
unsigned int oval_len = 0;
unsigned int *ovals = NULL;
int is_last;

len = 0;
while (len < PLAT_LIST_MAX && plist[len] != 0.0)
len++;

if (!len)
return 0;

/*
* Calculate bucket values, note down max and min values
*/
is_last = 0;
for (i = 0; i < PLAT_NR && !is_last; i++) {
sum += io_u_plat[i];
while (sum >= (plist[j] / 100.0 * nr)) {
if (j == oval_len) {
oval_len += 100;
ovals = realloc(ovals, oval_len * sizeof(unsigned int));
}

ovals[j] = plat_idx_to_val(i);
is_last = (j == len - 1);
if (is_last)
break;

j++;
}
}

*output = ovals;
return len;
}

static int calc_p99(struct stats *s)
{
unsigned int *ovals = NULL;
int ret = 0;
int len;

len = calc_percentiles(s->plat, s->nr_samples, &ovals);
if (len && len > PLIST_P99)
ret = ovals[PLIST_P99];
if (ovals)
free(ovals);
return ret;
}

static void show_latencies(struct stats *s)
{
unsigned int *ovals = NULL;
unsigned int len, i;

len = calc_percentiles(s->plat, s->nr_samples, &ovals);
if (len) {
fprintf(stderr, "Latency percentiles (usec)\n");
for (i = 0; i < len; i++)
fprintf(stderr, "\t%s%2.4fth: %u\n",
i == PLIST_P99 ? "*" : "",
plist[i], ovals[i]);
}

if (ovals)
free(ovals);

fprintf(stderr, "\tOver=%u, min=%u, max=%u\n", s->over, s->min, s->max);
}

/* fold latency info from s into d */
void combine_stats(struct stats *d, struct stats *s)
{
int i;
for (i = 0; i < PLAT_NR; i++)
d->plat[i] += s->plat[i];
d->nr_samples += s->nr_samples;
d->over += s->over;
if (s->max > d->max)
d->max = s->max;
if (s->min < d->min)
d->min = s->min;
}

/* record a latency result into the histogram */
static void add_lat(struct stats *s, unsigned int us)
{
int lat_index = 0;

if (us > s->max)
s->max = us;
if (us < s->min)
s->min = us;

if (us > max_us) {
fprintf(stderr, "latency=%u usec\n", us);
s->over++;
}

lat_index = plat_val_to_idx(us);
__sync_fetch_and_add(&s->plat[lat_index], 1);
__sync_fetch_and_add(&s->nr_samples, 1);
}

/*
* every thread has one of these, it comes out to about 19K thanks to the
* giant stats struct
*/
struct thread_data {
pthread_t tid;
/* ->next is for placing us on the msg_thread's list for waking */
struct thread_data *next;

/* our parent thread and messaging partner */
struct thread_data *msg_thread;

/*
* the msg thread stuffs gtod in here before waking us, so we can
* measure scheduler latency
*/
struct timeval wake_time;

/* keep the futex and the wake_time in the same cacheline */
int futex;

/* mr axboe's magic latency histogram */
struct stats stats;
};

/* we're so fancy we make our own futex wrappers */
#define FUTEX_BLOCKED 0
#define FUTEX_RUNNING 1

static int futex(int *uaddr, int futex_op, int val,
const struct timespec *timeout, int *uaddr2, int val3)
{
return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr2, val3);
}

/*
* wakeup a process waiting on a futex, making sure they are really waiting
* first
*/
static void fpost(int *futexp)
{
int s;

if (__sync_bool_compare_and_swap(futexp, FUTEX_BLOCKED,
FUTEX_RUNNING)) {
s = futex(futexp, FUTEX_WAKE, 1, NULL, NULL, 0);
if (s == -1) {
perror("FUTEX_WAKE");
exit(1);
}
}
}

/*
* wait on a futex, with an optional timeout. Make sure to set
* the futex to FUTEX_BLOCKED beforehand.
*
* This will return zero if all went well, or return -ETIMEDOUT if you
* hit the timeout without getting posted
*/
static int fwait(int *futexp, struct timespec *timeout)
{
int s;
while (1) {
/* Is the futex available? */
if (__sync_bool_compare_and_swap(futexp, FUTEX_RUNNING,
FUTEX_BLOCKED)) {
break; /* Yes */
}
/* Futex is not available; wait */
s = futex(futexp, FUTEX_WAIT, FUTEX_BLOCKED, timeout, NULL, 0);
if (s == -1 && errno != EAGAIN) {
if (errno == ETIMEDOUT)
return -ETIMEDOUT;
perror("futex-FUTEX_WAIT");
exit(1);
}
}
return 0;
}

/*
* cmpxchg based list prepend
*/
static void xlist_add(struct thread_data *head, struct thread_data *add)
{
struct thread_data *old;
struct thread_data *ret;

while (1) {
old = head->next;
add->next = old;
ret = __sync_val_compare_and_swap(&head->next, old, add);
if (ret == old)
break;
}
}

/*
* xchg based list splicing. This returns the entire list and
* replaces the head->next with NULL
*/
static struct thread_data *xlist_splice(struct thread_data *head)
{
struct thread_data *old;
struct thread_data *ret;

while (1) {
old = head->next;
ret = __sync_val_compare_and_swap(&head->next, old, NULL);
if (ret == old)
break;
}
return ret;
}

/*
* Wake everyone currently waiting on the message list, filling in their
* thread_data->wake_time with the current time.
*
* It's not exactly the current time, it's really the time at the start of
* the list run. We want to detect when the scheduler is just preempting the
* waker and giving away the rest of its timeslice. So we gtod once at
* the start of the loop and use that for all the threads we wake.
*/
static void xlist_wake_all(struct thread_data *td)
{
struct thread_data *list;
struct thread_data *next;
struct timeval now;

list = xlist_splice(td);
gettimeofday(&now, NULL);
while (list) {
next = list->next;
list->next = NULL;
memcpy(&list->wake_time, &now, sizeof(now));
fpost(&list->futex);
list = next;
}
}

/*
* called by worker threads to send a message and wait for the answer.
* In reality we're just trading one cacheline with the gtod and futex in
* it, but that's good enough. We gtod after waking and use that to
* record scheduler latency.
*/
static void msg_and_wait(struct thread_data *td)
{
struct timeval now;
unsigned long long delta;
struct timespec timeout;

timeout.tv_sec = 0;
timeout.tv_nsec = 5000 * 1000;

/* set ourselves to blocked */
td->futex = FUTEX_BLOCKED;
gettimeofday(&td->wake_time, NULL);

/* add us to the list */
xlist_add(td->msg_thread, td);

fpost(&td->msg_thread->futex);

/*
* don't wait if the main threads are shutting down,
* they will never kick us. fpost has a full barrier, so as long
* as the message thread walks his list after setting stopping,
* we shouldn't miss the wakeup
*/
if (!stopping) {
/* if he hasn't already woken us up, wait */
fwait(&td->futex, NULL);
}

gettimeofday(&now, NULL);
delta = tvdelta(&td->wake_time, &now);
if (delta > 0)
add_lat(&td->stats, delta);
}

/*
* once the message thread starts all his children, this is where he
* loops until our runtime is up. Basically this sits around waiting
* for posting by the worker threads, replying to their messages after
* a delay of 'sleeptime' + some jitter.
*/
static void run_msg_thread(struct thread_data *td)
{
struct timeval now;
struct timespec timeout;
unsigned int seed = pthread_self();
int max_jitter = sleeptime / 4;
int jitter;

jitter = rand_r(&seed) % max_jitter;
timeout.tv_sec = 0;
timeout.tv_nsec = (sleeptime + jitter) * 1000;

while (1) {
td->futex = FUTEX_BLOCKED;
xlist_wake_all(td);

gettimeofday(&now, NULL);
if (now.tv_sec > global_stop.tv_sec) {
stopping = 1;
__sync_synchronize();
xlist_wake_all(td);
break;
}
fwait(&td->futex, &timeout);

/*
* messages shouldn't be instant, sleep a little to make them
* wait
*/
jitter = rand_r(&seed) % max_jitter;
usleep(sleeptime + jitter);
}
}

#define nop __asm__ __volatile__("rep;nop": : :"memory")

static void usec_spin(unsigned long spin_time)
{
struct timeval now;
struct timeval start;
unsigned long long delta;

gettimeofday(&start, NULL);
while (1) {
gettimeofday(&now, NULL);
delta = tvdelta(&start, &now);
if (delta > spin_time)
return;
nop;
}
}

/*
* the worker thread is pretty simple, it just does a single spin and
* then waits on a message from the message thread
*/
void *worker_thread(void *arg)
{
struct thread_data *td = arg;

while(1) {
if (stopping)
break;

usec_spin(cputime);
msg_and_wait(td);
}
return NULL;
}

/*
* the message thread starts his own gaggle of workers and then sits around
* replying when they post him. He collects latency stats as all the threads
* exit
*/
void *message_thread(void *arg)
{
struct thread_data *td = arg;
struct thread_data *worker_threads_mem = NULL;
int i;
int ret;

worker_threads_mem = calloc(worker_threads, sizeof(struct thread_data));

if (!worker_threads_mem) {
perror("unable to allocate ram");
pthread_exit((void *)-ENOMEM);
}

for (i = 0; i < worker_threads; i++) {
pthread_t tid;

worker_threads_mem[i].msg_thread = td;
ret = pthread_create(&tid, NULL, worker_thread,
worker_threads_mem + i);
if (ret) {
fprintf(stderr, "error %d from pthread_create\n", ret);
exit(1);
}
worker_threads_mem[i].tid = tid;
}

run_msg_thread(td);

for (i = 0; i < worker_threads; i++) {
pthread_join(worker_threads_mem[i].tid, NULL);
combine_stats(&td->stats, &worker_threads_mem[i].stats);
}
free(worker_threads_mem);

return NULL;
}

int main(int ac, char **av)
{
int i;
int ret;
struct thread_data *message_threads_mem = NULL;
struct stats stats;

parse_options(ac, av);
again:
stopping = 0;
memset(&stats, 0, sizeof(stats));

message_threads_mem = calloc(message_threads,
sizeof(struct thread_data));


if (!message_threads_mem) {
perror("unable to allocate ram");
exit(1);
}
gettimeofday(&global_stop, NULL);
global_stop.tv_sec += runtime;

/* start our message threads, each one starts its own workers */
for (i = 0; i < message_threads; i++) {
pthread_t tid;
ret = pthread_create(&tid, NULL, message_thread,
message_threads_mem + i);
if (ret) {
fprintf(stderr, "error %d from pthread_create\n", ret);
exit(1);
}
message_threads_mem[i].tid = tid;
}
for (i = 0; i < message_threads; i++) {
pthread_join(message_threads_mem[i].tid, NULL);
combine_stats(&stats, &message_threads_mem[i].stats);
}

free(message_threads_mem);

/*
* in auto bench mode, keep adding workers until our latencies get
* horrible
*/
if (autobench) {
int p99 = calc_p99(&stats);
fprintf(stderr, "cputime %Lu threads %d p99 %d\n",
cputime, worker_threads, p99);
if (p99 < 2000) {
worker_threads++;
goto again;
}
}

show_latencies(&stats);

return 0;
}


Attachments:
schbench.c.gz (5.39 kB)

2016-04-05 18:43:13

by Bastien Philbert

Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Tue, Apr 5, 2016 at 2:08 PM, Chris Mason <[email protected]> wrote:
> Hi everyone,
>
> We're porting the fb kernel up to 4.5, and one of our last few out-of-tree
> patches is a hack to try harder to find idle cpus when waking up tasks.
> This helps in pretty much every workload we run, mostly because they all
> get tuned with a similar setup:
>
> 1) find the load where latencies stop being acceptable
> 2) Run the server at just a little less than that
>
> Usually this means our CPUs are just a little bit idle, and a poor
> scheduler decision to place a task on a busy CPU instead of an idle CPU
> ends up impacting our p99 latencies.
>
> Mike helped us with this last year, fixing up wake_wide() to improve
> things. But we still ended up having to go back to the old hack.
>
> I started with a small-ish program to benchmark wakeup latencies. The
> basic idea is a bunch of worker threads who sit around and burn CPU.
> Every once and a while they send a message to a message thread.
>
> The message thread records the time he woke up the worker, and the
> worker records the delta between that time and the time he actually got
> the CPU again. At the end it spits out a latency histogram. The only
> thing we record is the wakeup latency, there's no measurement of 'work
> done' or any of the normal things you'd expect in a benchmark.
>
> It has knobs for cpu think time, and for how long the messenger thread
> waits before replying. Here's how I'm running it with my patch:
>
> ./schbench -c 30000 -s 30000 -m 6 -t 24 -r 30
> Latency percentiles (usec)
> 50.0000th: 50
> 75.0000th: 62
> 90.0000th: 73
> 95.0000th: 79
> *99.0000th: 99
> 99.5000th: 761
> 99.9000th: 10160
> Over=0, min=0, max=14659
>
> This translates to cputime of 30ms, sleep time of 30ms, 6 messenger
> threads, 24 workers per messenger and a run time of 30 seconds. My box
> has two sockets, 24 cores each. Mainline varies a bit, but numbers like
> this are typical:
>
> ./schbench -c 30000 -s 30000 -m 6 -t 24 -r 30
> Latency percentiles (usec)
> 50.0000th: 50
> 75.0000th: 63
> 90.0000th: 76
> 95.0000th: 85
> *99.0000th: 4680
> 99.5000th: 10192
> 99.9000th: 10928
> Over=0, min=0, max=21816
>
> A high p99 in real application performance will block a new kernel for
> us. p99.5 and p99.9 are included just to show how long the tail really
> is.
>
> I've inlined schbench.c below and attached as a .gz file just in case
> exchange manages to munge it.
>
> Now, on to the patch. I pushed some code around and narrowed the
> problem down to select_idle_sibling() We have cores going into and out
> of idle fast enough that even this cut our latencies in half:
>
> static int select_idle_sibling(struct task_struct *p, int target)
> goto next;
>
> for_each_cpu(i, sched_group_cpus(sg)) {
> - if (i == target || !idle_cpu(i))
> + if (!idle_cpu(i))
> goto next;
> }
>
> IOW, by the time we get down to for_each_cpu(), the idle_cpu() check
> done at the top of the function is no longer valid.
>
> I tried a few variations on select_idle_sibling() that preserved the
> underlying goal of returning idle cores before idle SMT threads. They
> were all horrible in different ways, and none of them were fast.
>
> The patch below just makes select_idle_sibling pick the first idle
> thread it can find. When I ran it through production workloads here, it
> was faster than the patch we've been carrying around for the last few
> years.
>
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 56b7d4b..c41baa6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4974,7 +4974,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
> static int select_idle_sibling(struct task_struct *p, int target)
> {
> struct sched_domain *sd;
> - struct sched_group *sg;
> int i = task_cpu(p);
>
> if (idle_cpu(target))
> @@ -4990,24 +4989,14 @@ static int select_idle_sibling(struct task_struct *p, int target)
> * Otherwise, iterate the domains and find an elegible idle cpu.
> */
> sd = rcu_dereference(per_cpu(sd_llc, target));
> - for_each_lower_domain(sd) {
> - sg = sd->groups;
> - do {
> - if (!cpumask_intersects(sched_group_cpus(sg),
> - tsk_cpus_allowed(p)))
> - goto next;
> -
> - for_each_cpu(i, sched_group_cpus(sg)) {
> - if (i == target || !idle_cpu(i))
> - goto next;
> - }
> + if (!sd)
> + goto done;
>
> - target = cpumask_first_and(sched_group_cpus(sg),
> - tsk_cpus_allowed(p));
> + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
> + if (cpu_active(i) && idle_cpu(i)) {
> + target = i;
> goto done;
> -next:
> - sg = sg->next;
> - } while (sg != sd->groups);
> + }
> }
> done:
> return target;
>
> --------------------------------------------
>
Here is my concern: do you test this on standard scheduler workloads,
or was this just written for Facebook's internal workloads? I am going
to test this later because frankly this may cause a regression on my
system, which has only 4 cores, so an idle CPU is probably less common
for a small amount of time. I am wondering, however, whether Ingo has
any complaints before I test this to see if it causes a regression or
a bug on my system. Ingo, do you have any thoughts on this, or would
you like me to just test this?
Bastien

> [ schbench.c source quoted in full, snipped ]

2016-04-05 19:29:04

by Chris Mason

Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Tue, Apr 05, 2016 at 02:43:09PM -0400, Bastien Bastien Philbert wrote:
> On Tue, Apr 5, 2016 at 2:08 PM, Chris Mason <[email protected]> wrote:

[ ... ]

> >
> > I tried a few variations on select_idle_sibling() that preserved the
> > underlying goal of returning idle cores before idle SMT threads. They
> > were all horrible in different ways, and none of them were fast.
> >
> > The patch below just makes select_idle_sibling pick the first idle
> > thread it can find. When I ran it through production workloads here, it
> > was faster than the patch we've been carrying around for the last few
> > years.

[ ... ]

> >
> Here is my concern, do you test this on standard scheduler workloads
> or was this just written for Facebook's internal workloads. I am going
> to test this later because frankly this may cause a regression on my
> system which has only 4 cores so a idle CPU is probably less common
> for a small amount of time. I am wondering however if Ingo has any
> complains before I test this to see if it causes a regression or a bug
> on my system. Ingo do you have any thoughts on this or would you like
> me to just test this? Bastien

Pretty much every commit to select_idle_sibling over the last few years
was somehow trying to preserve or improve the select-idle-cores-first
functionality I just ripped out. So, it's safe to assume it'll break
something ;)

-chris

2016-04-05 20:03:10

by Matt Fleming

Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Tue, 05 Apr, at 02:08:22PM, Chris Mason wrote:
>
> I started with a small-ish program to benchmark wakeup latencies. The
> basic idea is a bunch of worker threads who sit around and burn CPU.
> Every once and a while they send a message to a message thread.

This reminds me of something I've been looking at recently; a similar
workload in Mel's mmtests based on pgbench with 1-client that also has
this problem of idle_cpu() being false at an inconvenient time in
select_idle_sibling(), so we move the task off the cpu and the cpu
then immediately goes idle.

This leads to tasks bouncing around the socket as we search for idle
cpus.

> It has knobs for cpu think time, and for how long the messenger thread
> waits before replying. Here's how I'm running it with my patch:

[...]

Cool, I'll go have a play with this.

> Now, on to the patch. I pushed some code around and narrowed the
> problem down to select_idle_sibling() We have cores going into and out
> of idle fast enough that even this cut our latencies in half:
>
> static int select_idle_sibling(struct task_struct *p, int target)
> goto next;
>
> for_each_cpu(i, sched_group_cpus(sg)) {
> - if (i == target || !idle_cpu(i))
> + if (!idle_cpu(i))
> goto next;
> }
>
> IOW, by the time we get down to for_each_cpu(), the idle_cpu() check
> done at the top of the function is no longer valid.

Yeah. The problem is that because we're racing with the cpu going in
and out of idle, and since you're exploiting that race condition, this
is highly tuned to your specific workload.

Which is a roundabout way of saying, this is probably going to
negatively impact other workloads.

> I tried a few variations on select_idle_sibling() that preserved the
> underlying goal of returning idle cores before idle SMT threads. They
> were all horrible in different ways, and none of them were fast.

I toyed with ignoring idle_cpu() in select_idle_sibling() for my
workload. That actually was faster ;)

> The patch below just makes select_idle_sibling pick the first idle
> thread it can find. When I ran it through production workloads here, it
> was faster than the patch we've been carrying around for the last few
> years.

It would be really nice if we had a lightweight way to gauge the
"idleness" of a cpu, and whether we expect it to be idle again soon.

Failing that, could we just force the task onto 'target' when it makes
sense and skip the idle search (and the race) altogether?
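
Something like the sketch below, say -- totally untested, and the "when
it makes sense" part (here just a bare nr_running check) is only a
placeholder for a real heuristic:

static int select_idle_sibling(struct task_struct *p, int target)
{
        if (idle_cpu(target))
                return target;

        /*
         * Placeholder heuristic: if target only has a single task on it,
         * assume it will go idle again soon and queue up behind it
         * rather than racing through the LLC scan.
         */
        if (cpu_rq(target)->nr_running <= 1)
                return target;

        /* ... otherwise fall through to the existing search ... */
}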

2016-04-05 21:06:11

by Bastien Philbert

Subject: Re: [PATCH RFC] select_idle_sibling experiments



On 2016-04-05 04:03 PM, Matt Fleming wrote:
> On Tue, 05 Apr, at 02:08:22PM, Chris Mason wrote:
>>
>> I started with a small-ish program to benchmark wakeup latencies. The
>> basic idea is a bunch of worker threads who sit around and burn CPU.
>> Every once and a while they send a message to a message thread.
>
> This reminds me of something I've been looking at recently; a similar
> workload in Mel's mmtests based on pgbench with 1-client that also has
> this problem of cpu_idle() being false at an inconvenient time in
> select_idle_sibling(), so we move the task off the cpu and the cpu
> then immediately goes idle.
>
> This leads to tasks bouncing around the socket as we search for idle
> cpus.
>
>> It has knobs for cpu think time, and for how long the messenger thread
>> waits before replying. Here's how I'm running it with my patch:
>
> [...]
>
> Cool, I'll go have a play with this.
>
>> Now, on to the patch. I pushed some code around and narrowed the
>> problem down to select_idle_sibling() We have cores going into and out
>> of idle fast enough that even this cut our latencies in half:
>>
>> static int select_idle_sibling(struct task_struct *p, int target)
>> goto next;
>>
>> for_each_cpu(i, sched_group_cpus(sg)) {
>> - if (i == target || !idle_cpu(i))
>> + if (!idle_cpu(i))
>> goto next;
>> }
>>
>> IOW, by the time we get down to for_each_cpu(), the idle_cpu() check
>> done at the top of the function is no longer valid.
>
> Yeah. The problem is that because we're racing with the cpu going in
> and out of idle, and since you're exploiting that race condition, this
> is highly tuned to your specific workload.
>
> Which is a roundabout way of saying, this is probably going to
> negatively impact other workloads.
>
>> I tried a few variations on select_idle_sibling() that preserved the
>> underlying goal of returning idle cores before idle SMT threads. They
>> were all horrible in different ways, and none of them were fast.
>
> I toyed with ignoring cpu_idle() in select_idle_sibling() for my
> workload. That actually was faster ;)
>
>> The patch below just makes select_idle_sibling pick the first idle
>> thread it can find. When I ran it through production workloads here, it
>> was faster than the patch we've been carrying around for the last few
>> years.
>
> It would be really nice if we had a lightweight way to gauge the
> "idleness" of a cpu, and whether we expect it to be idle again soon.
>
The best way to do this is probably to embed it in an already-used
structure so we can check it quickly. Otherwise I am curious whether
writing a macro may prove useful for this. It seems that idleness
needs to be accounted for when scheduling; to keep this lightweight
enough to avoid extra cost during a context switch, the challenge is
to make the reference counting cheap enough to be worth doing in the
current scheduling functions.
> Failing that, could we just force the task onto 'target' when it makes
> sense and skip the idle search (and the race) altogether?
>
Doesn't this possibly cause a context switch, or even an expensive move
to another CPU on certain architectures? It seems we need to add
reference counting or tracking of idle CPUs somewhere.
Bastien

2016-04-06 00:44:46

by Chris Mason

Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Tue, Apr 05, 2016 at 09:03:02PM +0100, Matt Fleming wrote:
> On Tue, 05 Apr, at 02:08:22PM, Chris Mason wrote:
> >
> > I started with a small-ish program to benchmark wakeup latencies. The
> > basic idea is a bunch of worker threads who sit around and burn CPU.
> > Every once and a while they send a message to a message thread.
>
> This reminds me of something I've been looking at recently; a similar
> workload in Mel's mmtests based on pgbench with 1-client that also has
> this problem of cpu_idle() being false at an inconvenient time in
> select_idle_sibling(), so we move the task off the cpu and the cpu
> then immediately goes idle.
>
> This leads to tasks bouncing around the socket as we search for idle
> cpus.

It might be worth making a way to claim the idle cpu. If there are lots
of them, we'll fan out properly instead of piling up into the first one.
If there are very few of them, we'll figure it out much faster.
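
Something like this, maybe (untested, just to show the shape of it --
the flag would need clearing when the cpu actually leaves idle or pulls
the task):

static DEFINE_PER_CPU(int, wake_claimed);

/* waker side: only one concurrent wakeup gets to pick a given idle cpu */
static bool try_claim_idle_cpu(int cpu)
{
        if (!idle_cpu(cpu))
                return false;
        return cmpxchg(&per_cpu(wake_claimed, cpu), 0, 1) == 0;
}

The select_idle_sibling() scan would then call try_claim_idle_cpu()
instead of a bare idle_cpu() check.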

>
> > It has knobs for cpu think time, and for how long the messenger thread
> > waits before replying. Here's how I'm running it with my patch:
>
> [...]
>
> Cool, I'll go have a play with this.

I'm more than open to ways to improve it, and I'll send to Mel or put in
a git tree if people find it useful.

>
> > Now, on to the patch. I pushed some code around and narrowed the
> > problem down to select_idle_sibling() We have cores going into and out
> > of idle fast enough that even this cut our latencies in half:
> >
> > static int select_idle_sibling(struct task_struct *p, int target)
> > goto next;
> >
> > for_each_cpu(i, sched_group_cpus(sg)) {
> > - if (i == target || !idle_cpu(i))
> > + if (!idle_cpu(i))
> > goto next;
> > }
> >
> > IOW, by the time we get down to for_each_cpu(), the idle_cpu() check
> > done at the top of the function is no longer valid.
>
> Yeah. The problem is that because we're racing with the cpu going in
> and out of idle, and since you're exploiting that race condition, this
> is highly tuned to your specific workload.
>
> Which is a roundabout way of saying, this is probably going to
> negatively impact other workloads.
>
> > I tried a few variations on select_idle_sibling() that preserved the
> > underlying goal of returning idle cores before idle SMT threads. They
> > were all horrible in different ways, and none of them were fast.
>
> I toyed with ignoring cpu_idle() in select_idle_sibling() for my
> workload. That actually was faster ;)
>
> > The patch below just makes select_idle_sibling pick the first idle
> > thread it can find. When I ran it through production workloads here, it
> > was faster than the patch we've been carrying around for the last few
> > years.
>
> It would be really nice if we had a lightweight way to gauge the
> "idleness" of a cpu, and whether we expect it to be idle again soon.
>
> Failing that, could we just force the task onto 'target' when it makes
> sense and skip the idle search (and the race) altogether?

To me it feels like the search for a full free core is impossible. The boxes
are intentionally loaded to the point where a full core is never going
to be free. So we need this loop to quickly pick a good candidate and
move on.

The benchmark is using ~30ms of CPU time in each worker thread, so
picking a CPU with a busy worker thread is going to have a pretty big
penalty. Just grabbing any CPU and hoping it'll be idle soon isn't
likely to work well, and if it does that's probably a bug in my
benchmark ;)

You can see this in action by adding one (or at most two) more threads
to the command line. The p99 jumps quickly to 2ms or more.

-chris

2016-04-06 07:27:34

by Mike Galbraith

Subject: Re: [PATCH RFC] select_idle_sibling experiments

> On Tue, 2016-04-05 at 14:08 -0400, Chris Mason wrote:

> Now, on to the patch. I pushed some code around and narrowed the
> problem down to select_idle_sibling() We have cores going into and out
> of idle fast enough that even this cut our latencies in half:

Are you using NO_HZ? If so, you may want to try the attached.

> static int select_idle_sibling(struct task_struct *p, int target)
> goto next;
>
> for_each_cpu(i, sched_group_cpus(sg)) {
> - if (i == target || !idle_cpu(i))
> + if (!idle_cpu(i))
> goto next;
> }
>
> IOW, by the time we get down to for_each_cpu(), the idle_cpu() check
> done at the top of the function is no longer valid.

Ok, that's only an optimization, could go if it's causing trouble.

> I tried a few variations on select_idle_sibling() that preserved the
> underlying goal of returning idle cores before idle SMT threads. They
> were all horrible in different ways, and none of them were fast.
>
> The patch below just makes select_idle_sibling pick the first idle
> thread it can find. When I ran it through production workloads here, it
> was faster than the patch we've been carrying around for the last few
> years.
>
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 56b7d4b..c41baa6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4974,7 +4974,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
> static int select_idle_sibling(struct task_struct *p, int target)
> {
> struct sched_domain *sd;
> - struct sched_group *sg;
> int i = task_cpu(p);
> 
> if (idle_cpu(target))
> @@ -4990,24 +4989,14 @@ static int select_idle_sibling(struct task_struct *p, int target)
> * Otherwise, iterate the domains and find an elegible idle cpu.
> */
> sd = rcu_dereference(per_cpu(sd_llc, target));
> - for_each_lower_domain(sd) {
> - sg = sd->groups;
> - do {
> - if (!cpumask_intersects(sched_group_cpus(sg),
> - tsk_cpus_allowed(p)))
> - goto next;
> -
> - for_each_cpu(i, sched_group_cpus(sg)) {
> - if (i == target || !idle_cpu(i))
> - goto next;
> - }
> + if (!sd)
> + goto done;
> 
> - target = cpumask_first_and(sched_group_cpus(sg),
> - tsk_cpus_allowed(p));
> + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
> + if (cpu_active(i) && idle_cpu(i)) {
> + target = i;
> goto done;
> -next:
> - sg = sg->next;
> - } while (sg != sd->groups);
> + }
> }
> done:
> return target;
>

Ew. That may improve your latency-is-everything load, but worst case a
package walk will hurt like hell on CPUs with an insane number of
threads. That full search also turns the evil face of two-faced little
select_idle_sibling() into its only face, the one that bounces tasks
about much more than they appreciate.

Looking for an idle core first delivers the most throughput boost, and
only looking at target's threads if you don't find one keeps the bounce
and traverse pain down to a dull roar, while at least trying to get
that latency win. To me, your patch looks like it trades harm to many,
for good for a few.
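
IOW, roughly: look for a core whose siblings are all idle anywhere in
the LLC, and only if that fails settle for an idle thread next to
target. As a sketch (illustrative only, not the mainline code):

/* pass 1: a fully idle core anywhere in the LLC */
for_each_cpu(core, sched_domain_span(sd)) {
        bool all_idle = true;

        for_each_cpu(i, topology_sibling_cpumask(core))
                if (!idle_cpu(i))
                        all_idle = false;
        if (all_idle)
                return core;
}

/* pass 2: an idle SMT sibling of target */
for_each_cpu(i, topology_sibling_cpumask(target))
        if (i != target && idle_cpu(i))
                return i;

return target;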

A behavior switch would be better. It can't get any dumber, but trying
to make it smarter makes it too damn fat. As it sits, it's aiming in
the general direction of the bullseye.. and occasionally hits the wall.

-Mike


Attachments:
sched-throttle-nohz.patch (1.50 kB)

2016-04-06 13:36:50

by Chris Mason

Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Wed, Apr 06, 2016 at 09:27:24AM +0200, Mike Galbraith wrote:
> > On Tue, 2016-04-05 at 14:08 -0400, Chris Mason wrote:
>
> > Now, on to the patch. I pushed some code around and narrowed the
> > problem down to select_idle_sibling() We have cores going into and out
> > of idle fast enough that even this cut our latencies in half:
>
> Are you using NO_HZ? If so, you may want to try the attached.

I'll definitely give it a shot. When I tried using the nohz idle bitmap
(Peter's idea) instead of the for_each_cpu() walks, it came out slower.
It feels like the cpus aren't getting all the way down into the idle
loop before more work comes, but I'll have to check.

>
> > static int select_idle_sibling(struct task_struct *p, int target)
> > goto next;
> >
> > for_each_cpu(i, sched_group_cpus(sg)) {
> > - if (i == target || !idle_cpu(i))
> > + if (!idle_cpu(i))
> > goto next;
> > }
> >
> > IOW, by the time we get down to for_each_cpu(), the idle_cpu() check
> > done at the top of the function is no longer valid.
>
> Ok, that's only an optimization, could go if it's causing trouble.

It's more an indication of how long we're spending in the current scan.
Long enough for the tests we're currently doing to be inaccurate.

[ my beautiful patch ]

> Ew. That may improve your latency-is-everything load, but worst case
> package walk will hurt like hell on CPUs with insane number of threads.
>
> That full search also turns the evil face of two-faced little
> select_idle_sibling() into its only face, the one that bounces tasks
> about much more than they appreciate.
>
> Looking for an idle core first delivers the most throughput boost, and
> only looking at target's threads if you don't find one keeps the bounce
> and traverse pain down to a dull roar, while at least trying to get
> that latency win. To me, your patch looks like it trades harm to many,
> for good for a few.

Yes, I'm tossing an important optimization. The goal wasn't to get rid
of that at all, but instead to find a way to get both. I just ran out
of ideas ;)

>
> A behavior switch would be better. It can't get any dumber, but trying
> to make it smarter makes it too damn fat. As it sits, it's aiming in
> the general direction of the bullseye.. and occasionally hits the wall.
>
> -Mike
>
> sched: ratelimit nohz
>
> Entering nohz code on every micro-idle is too expensive to bear.

This I really like. I'll setup a benchmark in production with it and
come back with results.

-chris

2016-04-07 15:18:04

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Tue, Apr 05, 2016 at 02:08:22PM -0400, Chris Mason wrote:
> Hi everyone,
>
> We're porting the fb kernel up to 4.5, and one of our last few out-of-tree
> patches is a hack to try harder to find idle cpus when waking up tasks.
> This helps in pretty much every workload we run, mostly because they all
> get tuned with a similar setup:
>
> 1) find the load where latencies stop being acceptable
> 2) Run the server at just a little less than that
>
> Usually this means our CPUs are just a little bit idle, and a poor
> scheduler decision to place a task on a busy CPU instead of an idle CPU
> ends up impacting our p99 latencies.
>
> Mike helped us with this last year, fixing up wake_wide() to improve
> things. But we still ended up having to go back to the old hack.
>
> I started with a small-ish program to benchmark wakeup latencies. The
> basic idea is a bunch of worker threads who sit around and burn CPU.
> Every once and a while they send a message to a message thread.
>
> The message thread records the time he woke up the worker, and the
> worker records the delta between that time and the time he actually got
> the CPU again. At the end it spits out a latency histogram. The only
> thing we record is the wakeup latency, there's no measurement of 'work
> done' or any of the normal things you'd expect in a benchmark.
>
> It has knobs for cpu think time, and for how long the messenger thread
> waits before replying. Here's how I'm running it with my patch:
>
> ./schbench -c 30000 -s 30000 -m 6 -t 24 -r 30

FYI, I changed schbench around a bit, and fixed a bug with -m (it was
ignored and always forced to 2).

The new code is here:

https://git.kernel.org/cgit/linux/kernel/git/mason/schbench.git/

I added a pipe simulation mode too, since I wanted wakeup latencies for
a raw tput test as well as my original workload. -p takes the size of
the transfer you want to simulate. There's no pipe involved, it's just
doing memsets on pages and waking the other thread with futexes. The
latency is still only the latency of the worker thread wakeup:

# taskset -c 0 ./schbench -p 4 -m 1 -t 1 -r 20
Latency percentiles (usec)
50.0000th: 1
75.0000th: 2
90.0000th: 2
95.0000th: 2
*99.0000th: 2
99.5000th: 2
99.9000th: 6
Over=0, min=0, max=43
avg worker transfer: 372311.38 ops/sec 1.42MB/s

# taskset -c 0 perf bench sched pipe -l 5000000
# Running 'sched/pipe' benchmark:
# Executed 5000000 pipe operations between two processes

Total time: 20.359 [sec]

4.071851 usecs/op
245588 ops/sec
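
If it helps to picture what that -p mode boils down to, here's a rough
standalone sketch of the futex ping-pong (not the actual schbench code;
the buffer size, sleep and single thread pair are made up):

/* build: gcc -O2 -pthread futex-sketch.c */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <time.h>

static int futex_flag;          /* 0: worker waits, 1: worker may run */
static struct timespec kicked;  /* when the messenger woke the worker */
static char buf[4096];          /* stand-in for the -p transfer buffer */

static long fwait(int *addr, int val)
{
	return syscall(SYS_futex, addr, FUTEX_WAIT, val, NULL, NULL, 0);
}

static long fwake(int *addr)
{
	return syscall(SYS_futex, addr, FUTEX_WAKE, 1, NULL, NULL, 0);
}

static void *worker(void *arg)
{
	struct timespec now;
	long lat;

	(void)arg;
	/* block until the messenger flips the flag and wakes us */
	while (__atomic_load_n(&futex_flag, __ATOMIC_ACQUIRE) == 0)
		fwait(&futex_flag, 0);

	/* the only thing measured: kick-to-running delta */
	clock_gettime(CLOCK_MONOTONIC, &now);
	lat = (now.tv_sec - kicked.tv_sec) * 1000000L +
	      (now.tv_nsec - kicked.tv_nsec) / 1000L;

	memset(buf, 0, sizeof(buf));    /* the "transfer" is just a memset */
	printf("wakeup latency: %ld usec\n", lat);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	usleep(10000);                  /* let the worker block on the futex */

	clock_gettime(CLOCK_MONOTONIC, &kicked);
	__atomic_store_n(&futex_flag, 1, __ATOMIC_RELEASE);
	fwake(&futex_flag);             /* this is the wakeup that gets timed */

	pthread_join(t, NULL);
	return 0;
}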

I'm taking another stab at fixing the regression for picking an idle
core in my first patch, and I'll get some benchmarks with Mike's nohz
patch going as well.

-chris

2016-04-09 17:31:13

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Wed, Apr 06, 2016 at 09:27:24AM +0200, Mike Galbraith wrote:
> > On Tue, 2016-04-05 at 14:08 -0400, Chris Mason wrote:
>
> > Now, on to the patch. I pushed some code around and narrowed the
> > problem down to select_idle_sibling() We have cores going into and out
> > of idle fast enough that even this cut our latencies in half:
>
> Are you using NO_HZ? If so, you may want to try the attached.

[ nohz throttling patch ]

I tested the nohz throttle two different ways, first with schbench's
pipe simulation, it's easily 8% faster with messages bouncing between
cpus.

In production it's hard to pick a single number because the benchmarks
produce latency curves as the workload scales up in RPS. The benefits
range from 2-9% depending on the metric. It's a nice win, and I'd love to
see it go in.

-chris

2016-04-09 19:06:30

by Chris Mason

[permalink] [raw]
Subject: sched: tweak select_idle_sibling to look for idle threads

select_task_rq_fair() can leave cpu utilization a little lumpy,
especially as the workload ramps up to the maximum capacity of the
machine. The end result can be high p99 response times as apps
wait to get scheduled, even when boxes are mostly idle.

I wrote schbench to try and measure this:

git://git.kernel.org/pub/scm/linux/kernel/git/mason/schbench.git

The basic idea is to record the latency between when a thread is kicked
and when it actually gets the CPU. For this patch I used a simple model
where a thread thinks for a while and then waits for data from another
thread. The command line below will start two messenger threads with 18
workers per messenger:

./schbench -m 2 -t 18 -s 30000 -c 30000 -r 30
Latency percentiles (usec)
50.0000th: 52
75.0000th: 63
90.0000th: 74
95.0000th: 80
*99.0000th: 118
99.5000th: 707
99.9000th: 5016
Over=0, min=0, max=12941

p99 numbers here are acceptable, but you can see the curve starting.
If I use 19 workers per messenger, p99 goes through the roof. This
machine has two sockets, 10 cores each, so with HT on, this commandline
has one pthread on each CPU:

./schbench -m 2 -t 19 -s 30000 -c 30000 -r 30
Latency percentiles (usec)
50.0000th: 51
75.0000th: 63
90.0000th: 76
95.0000th: 89
*99.0000th: 2132
99.5000th: 6920
99.9000th: 12752
Over=0, min=0, max=17079

This commit tries to solve things by doing an extra scan in
select_idle_sibling(). If it can find an idle sibling in any core in the
package, it will return that:

./schbench -m2 -t 19 -c 30000 -s 30000 -r 30
Latency percentiles (usec)
50.0000th: 65
75.0000th: 104
90.0000th: 115
95.0000th: 118
*99.0000th: 124
99.5000th: 127
99.9000th: 262
Over=0, min=0, max=12987

This basically means the whole fleet can have one more pthread per socket
and still maintain acceptable latencies. I can actually go up to -t 20,
but it's not as consistent:

./schbench -m2 -t 20 -c 30000 -s 30000 -r 30
Latency percentiles (usec)
50.0000th: 50
75.0000th: 63
90.0000th: 75
95.0000th: 81
*99.0000th: 111
99.5000th: 975
99.9000th: 12464
Over=0, min=0, max=18317

This does preserve the existing logic to prefer idle cores over idle
CPU threads, and includes some tests to try and avoid the idle scan when we're
actually better off sharing a non-idle CPU with someone else.

Benchmarks in production show overall capacity going up between 2-5%
depending on the metric.

Credits to Arun Sharma <[email protected]> for initial versions of this
patch.

Signed-off-by: Chris Mason <[email protected]>

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b7d4b..2c47240 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4969,11 +4969,34 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
}

/*
+ * helper for select_idle_sibling to decide if it should look for idle
+ * threads
+ */
+static int bounce_to_target(struct task_struct *p, int cpu)
+{
+ s64 delta;
+
+ /*
+ * as the run queue gets bigger, it's more and more likely that
+ * balance will have distributed things for us, and less likely
+ * that scanning all our CPUs for an idle one will find one.
+ * So, if nr_running > 1, just call this CPU good enough
+ */
+ if (cpu_rq(cpu)->cfs.nr_running > 1)
+ return 1;
+
+ /* taken from task_hot() */
+ delta = rq_clock_task(task_rq(p)) - p->se.exec_start;
+ return delta < (s64)sysctl_sched_migration_cost;
+}
+
+/*
* Try and locate an idle CPU in the sched_domain.
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
struct sched_domain *sd;
+ struct sched_domain *package_sd;
struct sched_group *sg;
int i = task_cpu(p);

@@ -4989,7 +5012,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
- sd = rcu_dereference(per_cpu(sd_llc, target));
+ package_sd = rcu_dereference(per_cpu(sd_llc, target));
+ sd = package_sd;
for_each_lower_domain(sd) {
sg = sd->groups;
do {
@@ -4998,7 +5022,12 @@ static int select_idle_sibling(struct task_struct *p, int target)
goto next;

for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
+ /*
+ * we tested target for idle up above,
+ * but don't skip it here because it might
+ * have raced to idle while we were scanning
+ */
+ if (!idle_cpu(i))
goto next;
}

@@ -5009,6 +5038,24 @@ next:
sg = sg->next;
} while (sg != sd->groups);
}
+
+ /*
+ * we're here because we didn't find an idle core, or an idle sibling
+ * in the target core. For message bouncing workloads, we want to
+ * just stick with the target suggestion from the caller, but
+ * otherwise we'd rather have an idle CPU from anywhere else in
+ * the package.
+ */
+ if (package_sd && !bounce_to_target(p, target)) {
+ for_each_cpu_and(i, sched_domain_span(package_sd),
+ tsk_cpus_allowed(p)) {
+ if (idle_cpu(i)) {
+ target = i;
+ break;
+ }
+
+ }
+ }
done:
return target;
}

2016-04-10 10:04:25

by Mike Galbraith

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Sat, 2016-04-09 at 15:05 -0400, Chris Mason wrote:

> This does preserve the existing logic to prefer idle cores over idle
> CPU threads, and includes some tests to try and avoid the idle scan when we're
> actually better off sharing a non-idle CPU with someone else.

My box says the "oh nevermind" checks aren't selective enough, tbench
dropped 4% at clients=cores, and 2% at clients=threads.

> Benchmarks in production show overall capacity going up between 2-5%
> depending on the metric.

Latency-rules-all loads certainly exist, and clearly want some love,
but the bigger the socket, and the more threads/core, the more that
traverse is gonna hurt the others, so seems either we need a better
filter, or a (yeah yeah, yet another damn) tweakable.

Oh, and bounce_to_target() seems an odd way to say full_traverse.

-Mike

2016-04-10 12:36:25

by Chris Mason

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Sun, Apr 10, 2016 at 12:04:21PM +0200, Mike Galbraith wrote:
> On Sat, 2016-04-09 at 15:05 -0400, Chris Mason wrote:
>
> > This does preserve the existing logic to prefer idle cores over idle
> > CPU threads, and includes some tests to try and avoid the idle scan when we're
> > actually better off sharing a non-idle CPU with someone else.
>
> My box says the "oh nevermind" checks aren't selective enough, tbench
> dropped 4% at clients=cores, and 2% at clients=threads.

I knew this part would need more experimentation, so I kept v1 as simple
as possible. On my box, tbench clients=cores is 5% faster,
clients=threads is 4% faster. bounce_to_target() is a small version of
task_hot(), I did get more accurate decisions by using the full
task_hot(), so I can try that again.

I'm testing on one of these:

Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz, which has two sockets and 10
cores per socket.

What are you testing with? If it's two sockets or less I may be able to
find one to reproduce with.

>
> > Benchmarks in production show overall capacity going up between 2-5%
> > depending on the metric.
>
> Latency-rules-all loads certainly exist, and clearly want some love,
> but the bigger the socket, and the more threads/core, the more that
> traverse is gonna hurt the others, so seems either we need a better
> filter, or a (yeah yeah, yet another damn) tweakable.
>
> Oh, and bounce_to_target() seems an odd way to say full_traverse.

Sure, I can rename it.

-chris

2016-04-10 12:46:59

by Mike Galbraith

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Sun, 2016-04-10 at 08:35 -0400, Chris Mason wrote:

> What are you testing with? If it's two sockets or less I may be able to
> find one to reproduce with.

i7-4790 desktop box.

-Mike

2016-04-10 19:56:20

by Chris Mason

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Sun, Apr 10, 2016 at 12:04:21PM +0200, Mike Galbraith wrote:
> On Sat, 2016-04-09 at 15:05 -0400, Chris Mason wrote:
>
> > This does preserve the existing logic to prefer idle cores over idle
> > CPU threads, and includes some tests to try and avoid the idle scan when we're
> > actually better off sharing a non-idle CPU with someone else.
>
> My box says the "oh nevermind" checks aren't selective enough, tbench
> dropped 4% at clients=cores, and 2% at clients=threads.

Ok, I was able to reproduce this by stuffing tbench_srv and tbench onto
just socket 0. Version 2 below fixes things for me, but I'm hoping
someone can suggest a way to get task_hot() buddy checks without the rq
lock.

I haven't run this on production loads yet, but our 4.0 patch for this
uses task_hot(), so I'd expect it to be on par. If this doesn't fix it
for you, I'll dig up a similar machine on Monday.

-chris

>From 306f7a2019b341d11c9713be83f41b2261994f44 Mon Sep 17 00:00:00 2001
From: Chris Mason <[email protected]>
Date: Fri, 8 Apr 2016 13:18:20 -0700
Subject: [PATCH v2] sched: tweak select_idle_sibling to look for idle threads

select_task_rq_fair() can leave cpu utilization a little lumpy,
especially as the workload ramps up to the maximum capacity of the
machine. The end result can be high p99 response times as apps
wait to get scheduled, even when boxes are mostly idle.

I wrote schbench to try and measure this:

git://git.kernel.org/pub/scm/linux/kernel/git/mason/schbench.git

The basic idea is to record the latency between when a thread is kicked
and when it actually gets the CPU. For this patch I used a simple model
where a thread thinks for a while and then waits for data from another
thread. The command line below will start two messenger threads with 18
workers per messenger:

./schbench -m 2 -t 18 -s 30000 -c 30000 -r 30
Latency percentiles (usec)
50.0000th: 52
75.0000th: 63
90.0000th: 74
95.0000th: 80
*99.0000th: 118
99.5000th: 707
99.9000th: 5016
Over=0, min=0, max=12941

p99 numbers here are acceptable. But if I use 19 workers per messenger,
p99 goes through the roof. This machine has two sockets, 10 cores each,
so with HT on, this commandline has one pthread on each CPU:

./schbench -m 2 -t 19 -s 30000 -c 30000 -r 30
Latency percentiles (usec)
50.0000th: 51
75.0000th: 63
90.0000th: 76
95.0000th: 89
*99.0000th: 2132
99.5000th: 6920
99.9000th: 12752
Over=0, min=0, max=17079

This commit tries to solve things by doing an extra scan in
select_idle_sibling(). If it can find an idle sibling in any core in the
package, it will return that:

./schbench -m2 -t 19 -c 30000 -s 30000 -r 30
Latency percentiles (usec)
50.0000th: 65
75.0000th: 104
90.0000th: 115
95.0000th: 118
*99.0000th: 124
99.5000th: 127
99.9000th: 262
Over=0, min=0, max=12987

This basically means the whole fleet can have one more pthread per socket
and still maintain acceptable latencies. I can actually go up to -t 20,
but the curve starts getting steep.

./schbench -m2 -t 20 -c 30000 -s 30000 -r 30
Latency percentiles (usec)
50.0000th: 50
75.0000th: 63
90.0000th: 75
95.0000th: 81
*99.0000th: 111
99.5000th: 975
99.9000th: 12464
Over=0, min=0, max=18317

This does preserve the existing logic to prefer idle cores over idle
CPU threads, and includes some tests to try and avoid the idle scan when we're
actually better off sharing a non-idle CPU with someone else.

Benchmarks in production show average duration of a request is 13%
faster, with overall capacity going up between 2-5% depending on the
metric.

Credits to Arun Sharma <[email protected]> for initial versions of this
patch.

Signed-off-by: Chris Mason <[email protected]>
---
v1 -> v2 call task_hot to decide if we should scan for idle cpus
kernel/sched/fair.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b7d4b..0c76505 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -658,6 +658,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
+static int should_scan_idle(struct task_struct *p, int cpu);

/*
* We choose a half-life close to 1 scheduling period.
@@ -4974,6 +4975,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
static int select_idle_sibling(struct task_struct *p, int target)
{
struct sched_domain *sd;
+ struct sched_domain *package_sd;
struct sched_group *sg;
int i = task_cpu(p);

@@ -4989,7 +4991,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
- sd = rcu_dereference(per_cpu(sd_llc, target));
+ package_sd = rcu_dereference(per_cpu(sd_llc, target));
+ sd = package_sd;
for_each_lower_domain(sd) {
sg = sd->groups;
do {
@@ -4998,7 +5001,12 @@ static int select_idle_sibling(struct task_struct *p, int target)
goto next;

for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
+ /*
+ * we tested target for idle up above,
+ * but don't skip it here because it might
+ * have raced to idle while we were scanning
+ */
+ if (!idle_cpu(i))
goto next;
}

@@ -5009,6 +5017,24 @@ next:
sg = sg->next;
} while (sg != sd->groups);
}
+
+ /*
+ * we're here because we didn't find an idle core, or an idle sibling
+ * in the target core. For message bouncing workloads, we want to
+ * just stick with the target suggestion from the caller, but
+ * otherwise we'd rather have an idle CPU from anywhere else in
+ * the package.
+ */
+ if (package_sd && should_scan_idle(p, target)) {
+ for_each_cpu_and(i, sched_domain_span(package_sd),
+ tsk_cpus_allowed(p)) {
+ if (idle_cpu(i)) {
+ target = i;
+ break;
+ }
+
+ }
+ }
done:
return target;
}
@@ -5714,6 +5740,35 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
return delta < (s64)sysctl_sched_migration_cost;
}

+/*
+ * helper for select_idle_sibling to decide if it should look for idle
+ * threads
+ */
+static int should_scan_idle(struct task_struct *p, int cpu)
+{
+ unsigned long flags;
+ struct lb_env env;
+ int hot;
+
+ /*
+ * as the run queue gets bigger, it's more and more likely that
+ * balance will have distributed things for us, and less likely
+ * that scanning all our CPUs for an idle one will find one.
+ * So, if nr_running > 1, just call this CPU good enough
+ */
+ if (cpu_rq(cpu)->cfs.nr_running > 1)
+ return 0;
+
+ env.src_rq = task_rq(p);
+ env.dst_rq = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&env.src_rq->lock, flags);
+ hot = task_hot(p, &env);
+ raw_spin_unlock_irqrestore(&env.src_rq->lock, flags);
+
+ return hot == 0;
+}
+
+
#ifdef CONFIG_NUMA_BALANCING
/*
* Returns 1, if task migration degrades locality
--
2.8.0.rc2

2016-04-11 04:54:25

by Mike Galbraith

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Sun, 2016-04-10 at 15:55 -0400, Chris Mason wrote:
> On Sun, Apr 10, 2016 at 12:04:21PM +0200, Mike Galbraith wrote:
> > On Sat, 2016-04-09 at 15:05 -0400, Chris Mason wrote:
> >
> > > This does preserve the existing logic to prefer idle cores over idle
> > > CPU threads, and includes some tests to try and avoid the idle scan when we're
> > > actually better off sharing a non-idle CPU with someone else.
> >
> > My box says the "oh nevermind" checks aren't selective enough, tbench
> > dropped 4% at clients=cores, and 2% at clients=threads.
>
> Ok, I was able to reproduce this by stuffing tbench_srv and tbench onto
> just socket 0. Version 2 below fixes things for me, but I'm hoping
> someone can suggest a way to get task_hot() buddy checks without the rq
> lock.
>
> I haven't run this on production loads yet, but our 4.0 patch for this
> uses task_hot(), so I'd expect it to be on par. If this doesn't fix it
> for you, I'll dig up a similar machine on Monday.

My box stopped caring. I personally would be reluctant to apply it
without a "you asked for it" button or a large pile of benchmark
results. Lock banging or not, full scan existing makes me nervous.

-Mike

2016-04-12 00:31:19

by Chris Mason

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Mon, Apr 11, 2016 at 06:54:21AM +0200, Mike Galbraith wrote:
> On Sun, 2016-04-10 at 15:55 -0400, Chris Mason wrote:
> > On Sun, Apr 10, 2016 at 12:04:21PM +0200, Mike Galbraith wrote:
> > > On Sat, 2016-04-09 at 15:05 -0400, Chris Mason wrote:
> > >
> > > > This does preserve the existing logic to prefer idle cores over idle
> > > > CPU threads, and includes some tests to try and avoid the idle scan when we're
> > > > actually better off sharing a non-idle CPU with someone else.
> > >
> > > My box says the "oh nevermind" checks aren't selective enough, tbench
> > > dropped 4% at clients=cores, and 2% at clients=threads.
> >
> > Ok, I was able to reproduce this by stuffing tbench_srv and tbench onto
> > just socket 0. Version 2 below fixes things for me, but I'm hoping
> > someone can suggest a way to get task_hot() buddy checks without the rq
> > lock.
> >
> > I haven't run this on production loads yet, but our 4.0 patch for this
> > uses task_hot(), so I'd expect it to be on par. If this doesn't fix it
> > for you, I'll dig up a similar machine on Monday.
>
> My box stopped caring. I personally would be reluctant to apply it
> without a "you asked for it" button or a large pile of benchmark
> results. Lock banging or not, full scan existing makes me nervous.


We can use a bitmap at the socket level to keep track of which cpus are
idle. I'm sure there are better places for the array and better ways to
allocate, this is just a rough cut to make sure the idle tracking works.

-chris

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a..1c3b5e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1055,6 +1055,8 @@ struct sched_domain {
unsigned int balance_interval; /* initialise to 1. units in ms. */
unsigned int nr_balance_failed; /* initialise to 0 */

+ cpumask_var_t idle_cpus_mask;
+
/* idle_balance() stats */
u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41f6b22..237d645 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3204,6 +3204,7 @@ again:
static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
+ struct sched_domain *package_sd;
unsigned long *switch_count;
struct rq *rq;
int cpu;
@@ -3270,11 +3270,19 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);

next = pick_next_task(rq, prev);
+
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;

if (likely(prev != next)) {
+ package_sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (package_sd) {
+ if (prev->policy == SCHED_IDLE && next->policy != SCHED_IDLE)
+ cpumask_clear_cpu(cpu, package_sd->idle_cpus_mask);
+ else if (next->policy == SCHED_IDLE)
+ cpumask_set_cpu(cpu, package_sd->idle_cpus_mask);
+ }
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -6599,7 +6607,6 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
-
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
@@ -7041,6 +7048,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
return child;

cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ zalloc_cpumask_var(&sd->idle_cpus_mask, GFP_NOWAIT);
+ cpumask_and(sd->idle_cpus_mask, cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c76505..cae6bd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5026,7 +5026,7 @@ next:
* the package.
*/
if (package_sd && should_scan_idle(p, target)) {
- for_each_cpu_and(i, sched_domain_span(package_sd),
+ for_each_cpu_and(i, package_sd->idle_cpus_mask,
tsk_cpus_allowed(p)) {
if (idle_cpu(i)) {
target = i;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 544a713..7e34b42 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -202,6 +202,9 @@ DEFINE_PER_CPU(bool, cpu_dead_idle);
*/
static void cpu_idle_loop(void)
{
+ int cpu;
+ struct sched_domain *package_sd;
+
while (1) {
/*
* If the arch has a polling bit, we maintain an invariant:
@@ -212,10 +215,19 @@ static void cpu_idle_loop(void)
* guaranteed to cause the cpu to reschedule.
*/

+
__current_set_polling();
quiet_vmstat();
tick_nohz_idle_enter();

+ preempt_disable();
+ cpu = smp_processor_id();
+ package_sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (package_sd) {
+ cpumask_set_cpu(cpu, package_sd->idle_cpus_mask);
+ }
+ preempt_enable();
+
while (!need_resched()) {
check_pgt_cache();
rmb();
--
2.8.0.rc2

2016-04-12 04:44:12

by Mike Galbraith

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Mon, 2016-04-11 at 20:30 -0400, Chris Mason wrote:
> On Mon, Apr 11, 2016 at 06:54:21AM +0200, Mike Galbraith wrote:

> > > Ok, I was able to reproduce this by stuffing tbench_srv and tbench onto
> > > just socket 0. Version 2 below fixes things for me, but I'm hoping
> > > someone can suggest a way to get task_hot() buddy checks without the rq
> > > lock.
> > >
> > > I haven't run this on production loads yet, but our 4.0 patch for this
> > > uses task_hot(), so I'd expect it to be on par. If this doesn't fix it
> > > for you, I'll dig up a similar machine on Monday.
> >
> > My box stopped caring. I personally would be reluctant to apply it
> > without a "you asked for it" button or a large pile of benchmark
> > results. Lock banging or not, full scan existing makes me nervous.
>
>
> We can use a bitmap at the socket level to keep track of which cpus are
> idle. I'm sure there are better places for the array and better ways to
> allocate, this is just a rough cut to make sure the idle tracking works.

See e0a79f529d5b:

pre 15.22 MB/sec 1 procs
post 252.01 MB/sec 1 procs

You can make traverse cycles go away, but those cycles, while precious,
are not the most costly cycles. The above was 1 tbench pair in an
otherwise idle box.. ie it wasn't traverse cycles that demolished it.

-Mike

(p.s. SCHED_IDLE is dinky bandwidth fair class)

2016-04-12 13:28:38

by Chris Mason

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Tue, Apr 12, 2016 at 06:44:08AM +0200, Mike Galbraith wrote:
> On Mon, 2016-04-11 at 20:30 -0400, Chris Mason wrote:
> > On Mon, Apr 11, 2016 at 06:54:21AM +0200, Mike Galbraith wrote:
>
> > > > Ok, I was able to reproduce this by stuffing tbench_srv and tbench onto
> > > > just socket 0. Version 2 below fixes things for me, but I'm hoping
> > > > someone can suggest a way to get task_hot() buddy checks without the rq
> > > > lock.
> > > >
> > > > I haven't run this on production loads yet, but our 4.0 patch for this
> > > > uses task_hot(), so I'd expect it to be on par. If this doesn't fix it
> > > > for you, I'll dig up a similar machine on Monday.
> > >
> > > My box stopped caring. I personally would be reluctant to apply it
> > > without a "you asked for it" button or a large pile of benchmark
> > > results. Lock banging or not, full scan existing makes me nervous.
> >
> >
> > We can use a bitmap at the socket level to keep track of which cpus are
> > idle. I'm sure there are better places for the array and better ways to
> > allocate, this is just a rough cut to make sure the idle tracking works.
>
> See e0a79f529d5b:
>
> pre 15.22 MB/sec 1 procs
> post 252.01 MB/sec 1 procs
>
> You can make traverse cycles go away, but those cycles, while precious,
> are not the most costly cycles. The above was 1 tbench pair in an
> otherwise idle box.. ie it wasn't traverse cycles that demolished it.

Agreed, this is why the decision not to scan is so important. But while
I've been describing this patch in terms of latency, latency is really
the symptom instead of the goal. Without these patches, workloads that
do want to fully utilize the hardware are basically getting one fewer
core of utilization. It's true that we define 'fully utilize' with an
upper bound on application response time, but we're not talking high
frequency trading here.

It clearly shows up in our graphs. CPU idle is higher (the lost core),
CPU user time is lower, average system load is higher (procs waiting on
a smaller number of cores).

We measure this internally with scheduling latency because that's the
easiest way to talk about it across a wide variety of hardware.

>
> -Mike
>
> (p.s. SCHED_IDLE is dinky bandwidth fair class)

Ugh, not my best quick patch, but you get the idea I was going for. I
can always add the tunable to flip things on/off but I'd prefer that we
find a good set of defaults, mostly so the FB production runtime is the
common config instead of the special snowflake.

-chris

2016-04-12 18:16:21

by Mike Galbraith

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Tue, 2016-04-12 at 09:27 -0400, Chris Mason wrote:
> I
> can always add the tunable to flip things on/off but I'd prefer that we
> find a good set of defaults, mostly so the FB production runtime is the
> common config instead of the special snowflake.

Yeah, generic has a much better chance to actually get merged, just
need a very solid chain on the lurking beast from hell. Hm...

The last time we went through this, the problem child was the waker of
many in your load. With tiny twiddle to wake_wide(), all was allegedly
well, or at least that's the impression I was left with. That leads me
to a pseudo-random thought: iff that waker of many is still at the
root, you could try using wake_wide() as the full search trigger, which
should shrink the attack surface available to the horror-from-hell
quite a lot. Just a thought.

-Mike

2016-04-12 20:08:09

by Chris Mason

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Tue, Apr 12, 2016 at 08:16:17PM +0200, Mike Galbraith wrote:
> On Tue, 2016-04-12 at 09:27 -0400, Chris Mason wrote:
> > I
> > can always add the tunable to flip things on/off but I'd prefer that we
> > find a good set of defaults, mostly so the FB production runtime is the
> > common config instead of the special snowflake.
>
> Yeah, generic has a much better chance to actually get merged, just
> need a very solid chain on the lurking beast from hell. Hm...
>
> The last time we went through this, the problem child was the waker of
> many in your load. With tiny twiddle to wake_wide(), all was allegedly
> well, or at least that's the impression I was left with. That leads me
> to a pseudo-random thought: iff that waker of many is still at the
> root, you could try using wake_wide() as the full search trigger, which
> should shrink the attack surface available to the horror-from-hell
> quite a lot. Just a thought.

I think that if we're worried about the cost of the idle scan for this
workload, find_idlest_group() is either going to hurt much more, or not
search enough CPUs to find the idle one.

But I'm happy to try patches or other ideas, I have a fixed version of
the bitmap one going through production benchmarks now.

-chris

2016-04-12 21:45:17

by Matt Fleming

[permalink] [raw]
Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Sat, 09 Apr, at 01:30:34PM, Chris Mason wrote:
>
> [ nohz throttling patch ]
>
> I tested the nohz throttle two different ways, first with schbench's
> pipe simulation, it's easily 8% faster with messages bouncing between
> cpus.
>
> In production it's hard to pick a single number because the benchmarks
> produce latency curves as the workload scales up in RPS. The benefits
> range from 2-9% depending on the metric. It's a nice win, and I'd love to
> see it go in.

Do we have any idea what the tradeoff is against power consumption for
throttling nohz?

2016-04-13 03:19:01

by Mike Galbraith

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Tue, 2016-04-12 at 16:07 -0400, Chris Mason wrote:

> I think that if we're worried about the cost of the idle scan for this
> workload, find_idlest_group() is either going to hurt much more, or not
> search enough CPUs to find the idle one.

find_idlest_group()? No no no, that's not what I mean at all.

wake_wide() identifies loads that really want to spread out, thus turns
off affine wakeups. We still call select_idle_sibling(), only
difference being that target is the original cpu, not the waking cpu.
Given making that wide connection bidirectional helped FB's load, it
seems reasonable that passing wide information to select_idle_sibling()
would have a good chance of hitting the candidate that stands to gain
from a full socket scan, while also keeping that cache scrambling scan
far away from the rest.
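
Something like this is all I mean -- sketch only, the extra parameter is
illustrative and the idle core preference is omitted for brevity:

static int select_idle_sibling(struct task_struct *p, int target, int wide)
{
	struct sched_domain *package_sd;
	int i;

	if (idle_cpu(target))
		return target;

	package_sd = rcu_dereference(per_cpu(sd_llc, target));
	if (!package_sd)
		return target;

	/* only the 1:N wakeups wake_wide() flagged pay for the full scan */
	if (wide) {
		for_each_cpu_and(i, sched_domain_span(package_sd),
				 tsk_cpus_allowed(p)) {
			if (idle_cpu(i))
				return i;
		}
	}

	return target;
}

/* with select_task_rq_fair() passing !want_affine, ie. wake_wide() fired */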

> But I'm happy to try patches or other ideas, I have a fixed version of
> the bitmap one going through production benchmarks now.

Making that wide/full search cheap is still good, because wake_wide()
also identifies interrupt sources that are waking many, so cheap wide
search should increase utilization there as well. The thought was to
just make the wide thing have a tad wider effect on what it already
does affect.. and hope that doesn't demolish anything.

-Mike

2016-04-13 03:40:24

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Tue, 2016-04-12 at 22:45 +0100, Matt Fleming wrote:
> On Sat, 09 Apr, at 01:30:34PM, Chris Mason wrote:
> >
> > [ nohz throttling patch ]
> >
> > I tested the nohz throttle two different ways, first with schbench's
> > pipe simulation, it's easily 8% faster with messages bouncing between
> > cpus.
> >
> > In production it's hard to pick a single number because the benchmarks
> > produce latency curves as the workload scales up in RPS. The benefits
> > range from 2-9% depending on the metric. It's a nice win, and I'd love to
> > see it go in.
>
> Do we have any idea what the tradeoff is against power consumption for
> throttling nohz?

That's measurable with the built in super duper watt meter gizmo
(turbostat). It should be dinky but existent, could be given an off
button for particularly attentive laptop drivers to poke. Server
drivers are unlikely to care given the performance win.

-Mike

2016-04-13 13:48:10

by Chris Mason

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Wed, Apr 13, 2016 at 05:18:51AM +0200, Mike Galbraith wrote:
> On Tue, 2016-04-12 at 16:07 -0400, Chris Mason wrote:
>
> > I think that if we're worried about the cost of the idle scan for this
> > workload, find_idlest_group() is either going to hurt much more, or not
> > search enough CPUs to find the idle one.
>
> find_idlest_group()? No no no, that's not what I mean at all.
>
> wake_wide() identifies loads that really want to spread out, thus turns
> off affine wakeups. We still call select_idle_sibling(), only
> difference being that target is the original cpu, not the waking cpu.

Ah ok, I see what you mean now.

> Given making that wide connection bidirectional helped FB's load, it
> seems reasonable that passing wide information to select_idle_sibling()
> would have a good chance of hitting the candidate that stands to gain
> from a full socket scan, while also keeping that cache scrambling scan
> far away from the rest.
>
> > But I'm happy to try patches or other ideas, I have a fixed version of
> > the bitmap one going through production benchmarks now.

[ benchmarks say it needs more fixing, ick ]

>
> Making that wide/full search cheap is still good, because wake_wide()
> also identifies interrupt sources that are waking many, so cheap wide
> search should increase utilization there as well. The thought was to
> just make the wide thing have a tad wider effect on what it already
> does affect.. and hope that doesn't demolish anything.

So you're interested in numbers where we pass the wake_wide decision
into select_idle_sibling(), and then use that instead of (or in addition
to?) my should_scan_idle() function?

I agree we may need to tweak wake_wide, since most of our wakeups now
are failed affine wakeups. But, the differences are in p99, so I'll
probably need to get some better metrics.

-chris

2016-04-13 14:23:02

by Mike Galbraith

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Wed, 2016-04-13 at 09:44 -0400, Chris Mason wrote:

> So you're interested in numbers where we pass the wake_wide decision
> into select_idle_sibling(), and then use that instead of (or in addition
> to?) my should_scan_idle() function?

Yeah, I was thinking instead of, and hoping that would be enough.

> I agree we may need to tweak wake_wide, since most of our wakeups now
> are failed affine wakeups.

What exactly do you mean by failed affine wakeups? Failed because
wake_wide() said we don't want one, or because wake_affine() said we
can't have one? If the latter, my thought bubble may have just burst,
but it still "feels" right.

-Mike

2016-04-13 14:37:13

by Chris Mason

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Wed, Apr 13, 2016 at 04:22:58PM +0200, Mike Galbraith wrote:
> On Wed, 2016-04-13 at 09:44 -0400, Chris Mason wrote:
>
> > So you're interested in numbers where we pass the wake_wide decision
> > into select_idle_sibling(), and then use that instead of (or in addition
> > to?) my should_scan_idle() function?
>
> Yeah, I was thinking instead of, and hoping that would be enough.

I'm definitely up for experimenting with different tests to decide when
to scan idle. I'll have to wait until after lsf/vault, but I can layout
a bunch of tests.

>
> > I agree we may need to tweak wake_wide, since most of our wakeups now
> > are failed affine wakeups.
>
> What exactly do you mean by failed affine wakeups? Failed because
> wake_wide() said we don't want one, or because wake_affine() said we
> can't have one? If the latter, my thought bubble may have just burst,
> but it still "feels" right.

I mean this number:

schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);

Is much much much higher than this number:

schedstat_inc(p, se.statistics.nr_wakeups_affine);

So, wake_affine said we can't have one. I made a script to sum it up
across all the threads of the webserver workload.
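
Roughly what the summing boils down to (a sketch, not the actual script;
the process name is illustrative and it needs schedstats enabled):

for t in /proc/$(pidof webserver)/task/*; do
    grep nr_wakeups_affine $t/sched
done | awk '{ sum[$1] += $3 } END { for (k in sum) print k, sum[k] }'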

-chris

2016-04-13 15:05:52

by Mike Galbraith

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Wed, 2016-04-13 at 10:36 -0400, Chris Mason wrote:
> On Wed, Apr 13, 2016 at 04:22:58PM +0200, Mike Galbraith wrote:

> > What exactly do you mean by failed affine wakeups? Failed because
> > wake_wide() said we don't want one, or because wake_affine() said we
> > can't have one? If the latter, my thought bubble may have just burst,
> > but it still "feels" right.
>
> I mean this number:
>
> schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
>
> Is much much much higher than this number:
>
> schedstat_inc(p, se.statistics.nr_wakeups_affine);
>
> So, wake_affine said we can't have one. I made a script to sum it up
> across all the threads of the webserver workload.

Hm, ok, that doesn't really tell us more than there's more to the load
than the 1:N bits that wake_wide() apparently did identify fairly well
last go, so targeting them still might help.

-Mike

2016-04-13 15:34:12

by Mike Galbraith

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

Another thing you could try is looking at your avg_idle, and twiddling
sched_migration_cost_ns to crank up idle balancing a bit.
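
Ie. something like the below (value purely illustrative, and mind that
the knob affects more than just idle balancing):

grep avg_idle /proc/sched_debug
echo 100000 > /proc/sys/kernel/sched_migration_cost_ns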

-Mike

2016-04-13 15:56:37

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Wed, Apr 13, 2016 at 05:40:20AM +0200, Mike Galbraith wrote:
> On Tue, 2016-04-12 at 22:45 +0100, Matt Fleming wrote:
> > On Sat, 09 Apr, at 01:30:34PM, Chris Mason wrote:
> > >
> > > [ nohz throttling patch ]
> > >
> > > I tested the nohz throttle two different ways, first with schbench's
> > > pipe simulation, it's easily 8% faster with messages bouncing between
> > > cpus.
> > >
> > > In production it's hard to pick a single number because the benchmarks
> > > produce latency curves as the workload scales up in RPS. The benefits
> > > range from 2-9% depending on the metric. It's a nice win, and I'd love to
> > > see it go in.
> >
> > Do we have any idea what the tradeoff is against power consumption for
> > throttling nohz?
>
> That's measurable with the built in super duper watt meter gizmo
> (turbostat). It should be dinky but existent, could be given an off
> button for particularly attentive laptop drivers to poke. Server
> drivers are unlikely to care given the performance win.

Our power sensors show it's basically a wash during the production
benchmark runs. Which makes sense because it's really only blinking
on/off at very high frequency.

-chris

2016-04-28 12:00:18

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Wed, Apr 06, 2016 at 09:27:24AM +0200, Mike Galbraith wrote:
> sched: ratelimit nohz
>
> Entering nohz code on every micro-idle is too expensive to bear.
>
> Signed-off-by: Mike Galbraith <[email protected]>

> +int sched_needs_cpu(int cpu)
> +{
> + if (tick_nohz_full_cpu(cpu))
> + return 0;
> +
> + return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;

So the only problem I have with this patch is the choice of limit. This
isn't at all tied to the migration cost.

And some people are already twiddling with the migration_cost knob to
affect the idle_balance() behaviour -- making it much more aggressive by
dialing it down. When you do that you also lose the effectiveness of
this proposed usage, even though those same people would probably want
this.

Failing a spot of inspiration for a runtime limit on this; we might have
to introduce yet another knob :/

2016-04-28 13:17:49

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH RFC] select_idle_sibling experiments

On Thu, 2016-04-28 at 14:00 +0200, Peter Zijlstra wrote:
> On Wed, Apr 06, 2016 at 09:27:24AM +0200, Mike Galbraith wrote:
> > sched: ratelimit nohz
> >
> > Entering nohz code on every micro-idle is too expensive to bear.
> >
> > Signed-off-by: Mike Galbraith <[email protected]>
>
> > +int sched_needs_cpu(int cpu)
> > +{
> > + if (tick_nohz_full_cpu(cpu))
> > + return 0;
> > +
> > + return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;
>
> So the only problem I have with this patch is the choice of limit. This
> isn't at all tied to the migration cost.

Yup.

> And some people are already twiddling with the migration_cost knob to
> affect the idle_balance() behaviour -- making it much more agressive by
> dialing it down. When you do that you also loose the effectiveness of
> this proposed usage, even though those same people would probably want
> this.
>
> Failing a spot of inspiration for a runtime limit on this; we might have
> to introduce yet another knob :/

I'll roll one with yet another of its very own.
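
Something along these lines, ie. just give the ratelimit its own limit
(knob name and default are illustrative, not what will actually get
posted):

unsigned int sysctl_sched_nohz_throttle = 500000;	/* ns, made up default */

int sched_needs_cpu(int cpu)
{
	if (tick_nohz_full_cpu(cpu))
		return 0;

	/* stay periodic while the average idle stint is this short */
	return cpu_rq(cpu)->avg_idle < sysctl_sched_nohz_throttle;
}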

-Mike

2016-04-30 12:47:45

by Peter Zijlstra

[permalink] [raw]
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Sat, Apr 09, 2016 at 03:05:54PM -0400, Chris Mason wrote:
> select_task_rq_fair() can leave cpu utilization a little lumpy,
> especially as the workload ramps up to the maximum capacity of the
> machine. The end result can be high p99 response times as apps
> wait to get scheduled, even when boxes are mostly idle.
>
> I wrote schbench to try and measure this:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/mason/schbench.git

Can you guys have a play with this; I think one and two node tbench are
good, but I seem to be getting significant run to run variance on that,
so maybe I'm not doing it right.

schbench numbers with: ./schbench -m2 -t 20 -c 30000 -s 30000 -r 30
on my ivb-ep (2 sockets, 10 cores/socket, 2 threads/core) appear to be
decent.

I've also not run anything other than schbench/tbench so maybe I
completely wrecked something else (as per usual..).

I've not thought about that bounce_to_target() thing much.. I'll go give
that a ponder.

---
kernel/sched/fair.c | 180 +++++++++++++++++++++++++++++++++++------------
kernel/sched/features.h | 1 +
kernel/sched/idle_task.c | 4 +-
kernel/sched/sched.h | 1 +
kernel/time/tick-sched.c | 10 +--
5 files changed, 146 insertions(+), 50 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8a33abce650..b9d8d1dc5183 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1501,8 +1501,10 @@ static void task_numa_compare(struct task_numa_env *env,
* One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one.
*/
- if (!cur)
+ if (!cur) {
+ // XXX borken
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ }

assign:
assigned = true;
@@ -4491,6 +4493,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}

#ifdef CONFIG_SMP
+
+/*
+ * Working cpumask for:
+ * load_balance,
+ * load_balance_newidle,
+ * select_idle_core.
+ *
+ * Assumes softirqs are disabled when in use.
+ */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* per rq 'load' arrray crap; XXX kill this.
@@ -5162,65 +5175,147 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}

+#ifdef CONFIG_SCHED_SMT
+
+static inline void clear_idle_cores(int cpu)
+{
+ struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ if (!sd)
+ return;
+
+ WRITE_ONCE(sd->groups->sgc->has_idle_cores, 0);
+}
+
+static inline void set_idle_cores(int cpu)
+{
+ struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ if (!sd)
+ return;
+
+ WRITE_ONCE(sd->groups->sgc->has_idle_cores, 1);
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+ struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ if (!sd)
+ return false;
+
+ // XXX static key for !SMT topologies
+
+ return READ_ONCE(sd->groups->sgc->has_idle_cores);
+}
+
+void update_idle_core(struct rq *rq)
+{
+ int core = cpu_of(rq);
+ int cpu;
+
+ rcu_read_lock();
+ if (test_idle_cores(core))
+ goto unlock;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (cpu == core)
+ continue;
+
+ if (!idle_cpu(cpu))
+ goto unlock;
+ }
+
+ set_idle_cores(core);
+unlock:
+ rcu_read_unlock();
+}
+
+static int select_idle_core(struct task_struct *p, int target)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+ struct sched_domain *sd;
+ int core, cpu;
+
+ sd = rcu_dereference(per_cpu(sd_llc, target));
+ cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+ for_each_cpu(core, cpus) {
+ bool idle = true;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ cpumask_clear_cpu(cpu, cpus);
+ if (!idle_cpu(cpu))
+ idle = false;
+ }
+
+ if (idle)
+ break;
+ }
+
+ return core;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline void clear_idle_cores(int cpu) { }
+static inline void set_idle_cores(int cpu) { }
+
+static inline bool test_idle_cores(int cpu)
+{
+ return false;
+}
+
+void update_idle_core(struct rq *rq) { }
+
+static inline int select_idle_core(struct task_struct *p, int target)
+{
+ return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
/*
- * Try and locate an idle CPU in the sched_domain.
+ * Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
struct sched_domain *sd;
- struct sched_group *sg;
int i = task_cpu(p);

if (idle_cpu(target))
return target;

/*
- * If the prevous cpu is cache affine and idle, don't be stupid.
+ * If the previous cpu is cache affine and idle, don't be stupid.
*/
if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
return i;

+ sd = rcu_dereference(per_cpu(sd_llc, target));
+ if (!sd)
+ return target;
+
/*
- * Otherwise, iterate the domains and find an eligible idle cpu.
- *
- * A completely idle sched group at higher domains is more
- * desirable than an idle group at a lower level, because lower
- * domains have smaller groups and usually share hardware
- * resources which causes tasks to contend on them, e.g. x86
- * hyperthread siblings in the lowest domain (SMT) can contend
- * on the shared cpu pipeline.
- *
- * However, while we prefer idle groups at higher domains
- * finding an idle cpu at the lowest domain is still better than
- * returning 'target', which we've already established, isn't
- * idle.
+ * If there are idle cores to be had, go find one.
*/
- sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- /* Ensure the entire group is idle */
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (sched_feat(IDLE_CORE) && test_idle_cores(target)) {
+ i = select_idle_core(p, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;

- /*
- * It doesn't matter which cpu we pick, the
- * whole group is idle.
- */
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ /*
+ * Failed to find an idle core; stop looking for one.
+ */
+ clear_idle_cores(target);
}
-done:
+
+ /*
+ * Otherwise, settle for anything idle in this cache domain.
+ */
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (!cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(i))
+ return i;
+ }
+
return target;
}

@@ -7229,9 +7324,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512

-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa46c2f..76bb8814649a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)

+SCHED_FEAT(IDLE_CORE, true)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 47ce94931f1b..cb394db407e4 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -23,11 +23,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_curr(rq);
}

+extern void update_idle_core(struct rq *rq);
+
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
put_prev_task(rq, prev);
-
+ update_idle_core(rq);
schedstat_inc(rq, sched_goidle);
return rq->idle;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 69da6fcaa0e8..5994794bfc85 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -866,6 +866,7 @@ struct sched_group_capacity {
* Number of busy cpus in this group.
*/
atomic_t nr_busy_cpus;
+ int has_idle_cores;

unsigned long cpumask[0]; /* iteration mask */
};
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 31872bc53bc4..6e42cd218ba5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void)
WARN_ON_ONCE(irqs_disabled());

/*
- * Update the idle state in the scheduler domain hierarchy
- * when tick_nohz_stop_sched_tick() is called from the idle loop.
- * State will be updated to busy during the first busy tick after
- * exiting idle.
- */
+ * Update the idle state in the scheduler domain hierarchy
+ * when tick_nohz_stop_sched_tick() is called from the idle loop.
+ * State will be updated to busy during the first busy tick after
+ * exiting idle.
+ */
set_cpu_sd_state_idle();

local_irq_disable();