2021-07-30 15:40:04

by Riccardo Mancini

[permalink] [raw]
Subject: [RFC PATCH v2 10/10] perf synthetic-events: use workqueue parallel_for

To generate synthetic events, perf has the option to use multiple
threads. These threads are created manually using pthread_create.

This patch replaces the manual pthread_create with a workqueue,
using the parallel_for utility.

Experimental results show that workqueue has a slightly higher overhead,
but this is repaid by the improved work balancing among threads.

Results of perf bench before and after are reported below:
Command: sudo ./perf bench internals synthesize -t
Average synthesis time in usec is reported.

Laptop (2 cores 4 threads i7), avg num events ~21500:
N pthread (before) workqueue (after)
1 121475.200 +- 2227.757 118882.900 +- 1389.398
2 72834.100 +- 1860.677 67668.600 +- 2847.693
3 70650.200 +- 540.096 55694.200 +- 496.155
4 55554.300 +- 259.968 50901.400 +- 434.327

VM (16 vCPU over 16 cores 32 threads Xeon), avg num events ~2920:
N pthread (before) workqueue (after)
1 35182.400 +- 3561.189 37528.300 +- 2972.887
2 29188.400 +- 2191.767 28250.300 +- 1694.575
3 22172.200 +- 788.659 19062.400 +- 611.201
4 21600.700 +- 728.941 16812.900 +- 1085.359
5 19395.800 +- 1070.617 14764.600 +- 1339.113
6 18553.000 +- 1272.486 12814.200 +- 408.462
7 14691.400 +- 485.105 12382.200 +- 464.964
8 16036.400 +- 842.728 15015.000 +- 1648.844
9 15606.800 +- 470.100 13230.800 +- 1288.246
10 15527.000 +- 822.317 12661.800 +- 873.199
11 13097.400 +- 513.870 13082.700 +- 974.378
12 14053.700 +- 592.427 13123.400 +- 1054.939
13 15446.400 +- 765.850 12837.200 +- 770.646
14 14979.400 +- 1056.955 13695.400 +- 1066.302
15 12578.000 +- 846.142 15053.600 +- 992.118
16 12394.800 +- 602.295 13683.700 +- 911.517

Signed-off-by: Riccardo Mancini <[email protected]>
---
tools/perf/util/synthetic-events.c | 155 +++++++++++++++--------------
1 file changed, 81 insertions(+), 74 deletions(-)

diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 35aa0c0f7cd955b2..3fcda677e100b3ae 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -22,6 +22,7 @@
#include <linux/string.h>
#include <linux/zalloc.h>
#include <linux/perf_event.h>
+#include <linux/err.h>
#include <asm/bug.h>
#include <perf/evsel.h>
#include <perf/cpumap.h>
@@ -41,6 +42,7 @@
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
+#include "util/workqueue/workqueue.h"

#define DEFAULT_PROC_MAP_PARSE_TIMEOUT 500

@@ -882,16 +884,13 @@ static int __perf_event__synthesize_threads(struct perf_tool *tool,
perf_event__handler_t process,
struct machine *machine,
bool mmap_data,
- struct dirent **dirent,
- int start,
- int num)
+ char *d_name)
{
union perf_event *comm_event, *mmap_event, *fork_event;
union perf_event *namespaces_event;
int err = -1;
char *end;
pid_t pid;
- int i;

comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
if (comm_event == NULL)
@@ -911,24 +910,22 @@ static int __perf_event__synthesize_threads(struct perf_tool *tool,
if (namespaces_event == NULL)
goto out_free_fork;

- for (i = start; i < start + num; i++) {
- if (!isdigit(dirent[i]->d_name[0]))
- continue;
+ if (!isdigit(d_name[0]))
+ goto out_free_namespaces;

- pid = (pid_t)strtol(dirent[i]->d_name, &end, 10);
- /* only interested in proper numerical dirents */
- if (*end)
- continue;
- /*
- * We may race with exiting thread, so don't stop just because
- * one thread couldn't be synthesized.
- */
- __event__synthesize_thread(comm_event, mmap_event, fork_event,
- namespaces_event, pid, 1, process,
- tool, machine, mmap_data);
- }
+ pid = (pid_t)strtol(d_name, &end, 10);
+ /* only interested in proper numerical dirents */
+ if (*end)
+ goto out_free_namespaces;
+ /*
+ * We may race with exiting thread, so don't stop just because
+ * one thread couldn't be synthesized.
+ */
+ __event__synthesize_thread(comm_event, mmap_event, fork_event,
+ namespaces_event, pid, 1, process,
+ tool, machine, mmap_data);
err = 0;
-
+out_free_namespaces:
free(namespaces_event);
out_free_fork:
free(fork_event);
@@ -946,19 +943,15 @@ struct synthesize_threads_arg {
struct machine *machine;
bool mmap_data;
struct dirent **dirent;
- int num;
- int start;
};

-static void *synthesize_threads_worker(void *arg)
+static void synthesize_threads_worker(int i, void *arg)
{
struct synthesize_threads_arg *args = arg;

__perf_event__synthesize_threads(args->tool, args->process,
args->machine, args->mmap_data,
- args->dirent,
- args->start, args->num);
- return NULL;
+ args->dirent[i]->d_name);
}

int perf_event__synthesize_threads(struct perf_tool *tool,
@@ -967,15 +960,15 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
bool mmap_data,
unsigned int nr_threads_synthesize)
{
- struct synthesize_threads_arg *args = NULL;
- pthread_t *synthesize_threads = NULL;
+ struct synthesize_threads_arg args;
char proc_path[PATH_MAX];
struct dirent **dirent;
- int num_per_thread;
- int m, n, i, j;
+ int n, i;
int thread_nr;
- int base = 0;
- int err = -1;
+ int err = -1, ret;
+ struct threadpool *pool;
+ struct workqueue_struct *wq;
+ char err_buf[WORKQUEUE_STRERR_BUFSIZE];


if (machine__is_default_guest(machine))
@@ -992,54 +985,68 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
thread_nr = nr_threads_synthesize;

if (thread_nr <= 1) {
- err = __perf_event__synthesize_threads(tool, process,
- machine, mmap_data,
- dirent, base, n);
+ for (i = 0; i < n; i++)
+ err = __perf_event__synthesize_threads(tool, process,
+ machine, mmap_data,
+ dirent[i]->d_name);
goto free_dirent;
}
- if (thread_nr > n)
- thread_nr = n;

- synthesize_threads = calloc(sizeof(pthread_t), thread_nr);
- if (synthesize_threads == NULL)
+ pool = threadpool__new(thread_nr);
+ if (IS_ERR(pool)) {
+ ret = threadpool__new_strerror(pool, err_buf, sizeof(err_buf));
+ pr_err("threadpool__new: %s\n",
+ ret ? "Error generating error msg" : err_buf);
goto free_dirent;
-
- args = calloc(sizeof(*args), thread_nr);
- if (args == NULL)
- goto free_threads;
-
- num_per_thread = n / thread_nr;
- m = n % thread_nr;
- for (i = 0; i < thread_nr; i++) {
- args[i].tool = tool;
- args[i].process = process;
- args[i].machine = machine;
- args[i].mmap_data = mmap_data;
- args[i].dirent = dirent;
- }
- for (i = 0; i < m; i++) {
- args[i].num = num_per_thread + 1;
- args[i].start = i * args[i].num;
- }
- if (i != 0)
- base = args[i-1].start + args[i-1].num;
- for (j = i; j < thread_nr; j++) {
- args[j].num = num_per_thread;
- args[j].start = base + (j - i) * args[i].num;
}

- for (i = 0; i < thread_nr; i++) {
- if (pthread_create(&synthesize_threads[i], NULL,
- synthesize_threads_worker, &args[i]))
- goto out_join;
- }
- err = 0;
-out_join:
- for (i = 0; i < thread_nr; i++)
- pthread_join(synthesize_threads[i], NULL);
- free(args);
-free_threads:
- free(synthesize_threads);
+ err = threadpool__start(pool);
+ if (err) {
+ ret = threadpool__strerror(pool, err, err_buf, sizeof(err_buf));
+ pr_err("threadpool__start: %s\n",
+ ret ? "Error generating error msg" : err_buf);
+ goto free_pool;
+ }
+
+ wq = create_workqueue(pool);
+ if (IS_ERR(wq)) {
+ ret = create_workqueue_strerror(wq, err_buf, sizeof(err_buf));
+ pr_err("create_workqueue: %s\n",
+ ret ? "Error generating error msg" : err_buf);
+ goto stop_pool;
+ }
+
+ args.tool = tool;
+ args.process = process;
+ args.machine = machine;
+ args.mmap_data = mmap_data;
+ args.dirent = dirent;
+
+ ret = parallel_for(wq, 0, n, 1, synthesize_threads_worker, &args);
+ if (ret) {
+ ret = workqueue_strerror(wq, ret, err_buf, sizeof(err_buf));
+ pr_err("parallel_for: %s\n",
+ ret ? "Error generating error msg" : err_buf);
+ err = ret;
+ }
+
+ ret = destroy_workqueue(wq);
+ if (ret) {
+ ret = destroy_workqueue_strerror(ret, err_buf, sizeof(err_buf));
+ pr_err("destroy_workqueue: %s\n",
+ ret ? "Error generating error msg" : err_buf);
+ err = ret;
+ }
+stop_pool:
+ ret = threadpool__stop(pool);
+ if (ret) {
+ ret = threadpool__strerror(pool, ret, err_buf, sizeof(err_buf));
+ pr_err("threadpool__stop: %s\n",
+ ret ? "Error generating error msg" : err_buf);
+ err = ret;
+ }
+free_pool:
+ threadpool__delete(pool);
free_dirent:
for (i = 0; i < n; i++)
zfree(&dirent[i]);
--
2.31.1



2021-08-09 13:29:34

by Riccardo Mancini

[permalink] [raw]
Subject: Re: [RFC PATCH v2 10/10] perf synthetic-events: use workqueue parallel_for

Hi Jiri,
thanks for looking into this patchset.

On Mon, 2021-08-09 at 14:04 +0200, Jiri Olsa wrote:
> On Fri, Jul 30, 2021 at 05:34:17PM +0200, Riccardo Mancini wrote:
> > To generate synthetic events, perf has the option to use multiple
> > threads. These threads are created manually using pthread_created.
> >
> > This patch replaces the manual pthread_create with a workqueue,
> > using the parallel_for utility.
>
> hi,
> I really like this new interface
>
> >
> > Experimental results show that workqueue has a slightly higher overhead,
> > but this is repayed by the improved work balancing among threads.
>
> how did you measure that balancing improvement?
> is there less kernel cycles spent?

I meant that the workqueue with the shared queue is able to balance work among
threads. This is particularly important in synthesis since low pids are
associated with kthreads, which require less processing (as far as I understand),
therefore the current work assignment is not great.

I think the goal of the workqueue is not to be faster than the current
implementation (which it is only due to the aforementioned issue), but to have a
better abstraction with a contained overhead.

>
> I ran the benchmark and if I'm reading the results correctly I see
> performance drop for high cpu numbers (full list attached below).

The implementation with the shared queue suffers from this problem, but I hoped
it would hold up to more threads.
I'm working on having one queue per thread, in order to be able to scale better
on more cpus. I do not have any work stealing at the moment, so there is no
rebalancing of work, but in our use cases it is not that important at the
moment.
You can find it on https://github.com/Manciukic/linux.git in the
perf/workqueue/dev branch. If you can run the same benchmark there, it would be
really helpful for me.

Thanks,
Riccardo

>
>
> old perf:                                                                 new perf:
>
> [jolsa@dell-r440-01 perf]$ ./perf.old bench internals synthesize -t       [jolsa@dell-r440-01 perf]$ ./perf bench internals synthesize -t
> ...
>   Number of synthesis threads: 40                                           Number of synthesis threads: 40
>     Average synthesis took: 2489.400 usec (+- 49.832 usec)                    Average synthesis took: 4576.500 usec (+- 75.278 usec)
>     Average num. events: 956.800 (+- 6.721)                                   Average num. events: 1020.000 (+- 0.000)
>     Average time per event 2.602 usec                                         Average time per event 4.487 usec
>
> maybe profiling will show what's going on?
>
> thanks,
> jirka
>
>
> ---
> [jolsa@dell-r440-01 perf]$ ./perf.old bench internals synthesize -t      
> [jolsa@dell-r440-01 perf]$ ./perf bench internals synthesize -t
> # Running 'internals/synthesize' benchmark:                               #
> Running 'internals/synthesize' benchmark:
> Computing performance of multi threaded perf event synthesis by          
> Computing performance of multi threaded perf event synthesis by
> synthesizing events on CPU 0:                                            
> synthesizing events on CPU 0:
>   Number of synthesis threads: 1                                           
> Number of synthesis threads: 1
>     Average synthesis took: 7907.100 usec (+- 197.363 usec)                  
> Average synthesis took: 7972.900 usec (+- 198.158 usec)
>     Average num. events: 956.000 (+- 0.000)                                  
> Average num. events: 936.000 (+- 0.000)
>     Average time per event 8.271 usec                                        
> Average time per event 8.518 usec
>   Number of synthesis threads: 2                                           
> Number of synthesis threads: 2
>     Average synthesis took: 5616.800 usec (+- 61.253 usec)                   
> Average synthesis took: 5844.700 usec (+- 87.219 usec)
>     Average num. events: 958.800 (+- 0.327)                                  
> Average num. events: 940.000 (+- 0.000)
>     Average time per event 5.858 usec                                        
> Average time per event 6.218 usec
>   Number of synthesis threads: 3                                           
> Number of synthesis threads: 3
>     Average synthesis took: 4274.000 usec (+- 93.293 usec)                   
> Average synthesis took: 4019.700 usec (+- 67.354 usec)
>     Average num. events: 962.000 (+- 0.000)                                  
> Average num. events: 942.000 (+- 0.000)
>     Average time per event 4.443 usec                                        
> Average time per event 4.267 usec
>   Number of synthesis threads: 4                                           
> Number of synthesis threads: 4
>     Average synthesis took: 3425.700 usec (+- 43.044 usec)                   
> Average synthesis took: 3382.200 usec (+- 74.652 usec)
>     Average num. events: 959.600 (+- 0.933)                                  
> Average num. events: 944.000 (+- 0.000)
>     Average time per event 3.570 usec                                        
> Average time per event 3.583 usec
>   Number of synthesis threads: 5                                           
> Number of synthesis threads: 5
>     Average synthesis took: 2958.000 usec (+- 82.951 usec)                   
> Average synthesis took: 3086.500 usec (+- 48.213 usec)
>     Average num. events: 966.000 (+- 0.000)                                  
> Average num. events: 946.000 (+- 0.000)
>     Average time per event 3.062 usec                                        
> Average time per event 3.263 usec
>   Number of synthesis threads: 6                                           
> Number of synthesis threads: 6
>     Average synthesis took: 2808.400 usec (+- 66.868 usec)                   
> Average synthesis took: 2752.200 usec (+- 56.411 usec)
>     Average num. events: 956.800 (+- 0.327)                                  
> Average num. events: 948.000 (+- 0.000)
>     Average time per event 2.935 usec                                        
> Average time per event 2.903 usec
>   Number of synthesis threads: 7                                           
> Number of synthesis threads: 7
>     Average synthesis took: 2622.900 usec (+- 83.524 usec)                   
> Average synthesis took: 2548.200 usec (+- 48.042 usec)
>     Average num. events: 958.400 (+- 0.267)                                  
> Average num. events: 950.000 (+- 0.000)
>     Average time per event 2.737 usec                                        
> Average time per event 2.682 usec
>   Number of synthesis threads: 8                                           
> Number of synthesis threads: 8
>     Average synthesis took: 2271.600 usec (+- 29.181 usec)                   
> Average synthesis took: 2486.600 usec (+- 47.862 usec)
>     Average num. events: 972.000 (+- 0.000)                                  
> Average num. events: 952.000 (+- 0.000)
>     Average time per event 2.337 usec                                        
> Average time per event 2.612 usec
>   Number of synthesis threads: 9                                           
> Number of synthesis threads: 9
>     Average synthesis took: 2372.000 usec (+- 95.495 usec)                   
> Average synthesis took: 2347.300 usec (+- 23.959 usec)
>     Average num. events: 959.200 (+- 0.952)                                  
> Average num. events: 954.000 (+- 0.000)
>     Average time per event 2.473 usec                                        
> Average time per event 2.460 usec
>   Number of synthesis threads: 10                                          
> Number of synthesis threads: 10
>     Average synthesis took: 2544.600 usec (+- 107.569 usec)                  
> Average synthesis took: 2328.800 usec (+- 14.234 usec)
>     Average num. events: 968.400 (+- 3.124)                                  
> Average num. events: 957.400 (+- 0.306)
>     Average time per event 2.628 usec                                        
> Average time per event 2.432 usec
>   Number of synthesis threads: 11                                          
> Number of synthesis threads: 11
>     Average synthesis took: 2299.300 usec (+- 57.597 usec)                   
> Average synthesis took: 2340.300 usec (+- 34.638 usec)
>     Average num. events: 956.000 (+- 0.000)                                  
> Average num. events: 960.000 (+- 0.000)
>     Average time per event 2.405 usec                                        
> Average time per event 2.438 usec
>   Number of synthesis threads: 12                                          
> Number of synthesis threads: 12
>     Average synthesis took: 2545.500 usec (+- 69.557 usec)                   
> Average synthesis took: 2318.700 usec (+- 15.803 usec)
>     Average num. events: 974.800 (+- 0.611)                                  
> Average num. events: 963.800 (+- 0.200)
>     Average time per event 2.611 usec                                        
> Average time per event 2.406 usec
>   Number of synthesis threads: 13                                          
> Number of synthesis threads: 13
>     Average synthesis took: 2386.400 usec (+- 79.244 usec)                   
> Average synthesis took: 2408.700 usec (+- 27.071 usec)
>     Average num. events: 950.500 (+- 5.726)                                  
> Average num. events: 966.000 (+- 0.000)
>     Average time per event 2.511 usec                                        
> Average time per event 2.493 usec
>   Number of synthesis threads: 14                                          
> Number of synthesis threads: 14
>     Average synthesis took: 2466.600 usec (+- 57.893 usec)                   
> Average synthesis took: 2547.200 usec (+- 53.445 usec)
>     Average num. events: 957.600 (+- 0.718)                                  
> Average num. events: 968.000 (+- 0.000)
>     Average time per event 2.576 usec                                        
> Average time per event 2.631 usec
>   Number of synthesis threads: 15                                          
> Number of synthesis threads: 15
>     Average synthesis took: 2249.700 usec (+- 64.026 usec)                   
> Average synthesis took: 2647.900 usec (+- 79.014 usec)
>     Average num. events: 956.000 (+- 0.000)                                  
> Average num. events: 970.000 (+- 0.000)
>     Average time per event 2.353 usec                                        
> Average time per event 2.730 usec
>   Number of synthesis threads: 16                                          
> Number of synthesis threads: 16
>     Average synthesis took: 2311.700 usec (+- 64.304 usec)                   
> Average synthesis took: 2676.200 usec (+- 34.824 usec)
>     Average num. events: 955.000 (+- 0.907)                                  
> Average num. events: 972.000 (+- 0.000)
>     Average time per event 2.421 usec                                        
> Average time per event 2.753 usec
>   Number of synthesis threads: 17                                          
> Number of synthesis threads: 17
>     Average synthesis took: 2174.100 usec (+- 36.673 usec)                   
> Average synthesis took: 2580.100 usec (+- 45.414 usec)
>     Average num. events: 971.600 (+- 3.124)                                  
> Average num. events: 974.000 (+- 0.000)
>     Average time per event 2.238 usec                                        
> Average time per event 2.649 usec
>   Number of synthesis threads: 18                                          
> Number of synthesis threads: 18
>     Average synthesis took: 2294.200 usec (+- 63.657 usec)                   
> Average synthesis took: 2810.200 usec (+- 49.113 usec)
>     Average num. events: 953.200 (+- 0.611)                                  
> Average num. events: 976.000 (+- 0.000)
>     Average time per event 2.407 usec                                        
> Average time per event 2.879 usec
>   Number of synthesis threads: 19                                          
> Number of synthesis threads: 19
>     Average synthesis took: 2410.700 usec (+- 120.169 usec)                  
> Average synthesis took: 2862.400 usec (+- 36.982 usec)
>     Average num. events: 953.400 (+- 0.306)                                  
> Average num. events: 978.000 (+- 0.000)
>     Average time per event 2.529 usec                                        
> Average time per event 2.927 usec
>   Number of synthesis threads: 20                                          
> Number of synthesis threads: 20
>     Average synthesis took: 2387.000 usec (+- 91.051 usec)                   
> Average synthesis took: 2908.800 usec (+- 36.404 usec)
>     Average num. events: 952.800 (+- 0.800)                                  
> Average num. events: 978.600 (+- 0.306)
>     Average time per event 2.505 usec                                        
> Average time per event 2.972 usec
>   Number of synthesis threads: 21                                          
> Number of synthesis threads: 21
>     Average synthesis took: 2275.700 usec (+- 39.815 usec)                   
> Average synthesis took: 3141.100 usec (+- 30.896 usec)
>     Average num. events: 954.600 (+- 0.306)                                  
> Average num. events: 980.000 (+- 0.000)
>     Average time per event 2.384 usec                                        
> Average time per event 3.205 usec
>   Number of synthesis threads: 22                                          
> Number of synthesis threads: 22
>     Average synthesis took: 2373.200 usec (+- 89.528 usec)                   
> Average synthesis took: 3342.400 usec (+- 112.115 usec)
>     Average num. events: 949.100 (+- 5.843)                                  
> Average num. events: 982.000 (+- 0.000)
>     Average time per event 2.500 usec                                        
> Average time per event 3.404 usec
>   Number of synthesis threads: 23                                          
> Number of synthesis threads: 23
>     Average synthesis took: 2318.300 usec (+- 39.395 usec)                   
> Average synthesis took: 3269.700 usec (+- 55.215 usec)
>     Average num. events: 954.600 (+- 0.427)                                  
> Average num. events: 984.000 (+- 0.000)
>     Average time per event 2.429 usec                                        
> Average time per event 3.323 usec
>   Number of synthesis threads: 24                                          
> Number of synthesis threads: 24
>     Average synthesis took: 2241.900 usec (+- 52.577 usec)                   
> Average synthesis took: 3379.500 usec (+- 56.380 usec)
>     Average num. events: 954.000 (+- 0.000)                                  
> Average num. events: 986.000 (+- 0.000)
>     Average time per event 2.350 usec                                        
> Average time per event 3.427 usec
>   Number of synthesis threads: 25                                          
> Number of synthesis threads: 25
>     Average synthesis took: 2343.400 usec (+- 101.611 usec)                  
> Average synthesis took: 3382.500 usec (+- 51.535 usec)
>     Average num. events: 956.200 (+- 1.009)                                  
> Average num. events: 988.000 (+- 0.000)
>     Average time per event 2.451 usec                                        
> Average time per event 3.424 usec
>   Number of synthesis threads: 26                                          
> Number of synthesis threads: 26
>     Average synthesis took: 2260.700 usec (+- 18.863 usec)                   
> Average synthesis took: 3391.600 usec (+- 44.053 usec)
>     Average num. events: 954.000 (+- 0.000)                                  
> Average num. events: 990.000 (+- 0.000)
>     Average time per event 2.370 usec                                        
> Average time per event 3.426 usec
>   Number of synthesis threads: 27                                          
> Number of synthesis threads: 27
>     Average synthesis took: 2373.800 usec (+- 74.213 usec)                   
> Average synthesis took: 3659.200 usec (+- 113.176 usec)
>     Average num. events: 955.000 (+- 0.803)                                  
> Average num. events: 992.000 (+- 0.000)
>     Average time per event 2.486 usec                                        
> Average time per event 3.689 usec
>   Number of synthesis threads: 28                                          
> Number of synthesis threads: 28
>     Average synthesis took: 2335.500 usec (+- 49.480 usec)                   
> Average synthesis took: 3625.000 usec (+- 90.131 usec)
>     Average num. events: 954.000 (+- 0.000)                                  
> Average num. events: 994.000 (+- 0.000)
>     Average time per event 2.448 usec                                        
> Average time per event 3.647 usec
>   Number of synthesis threads: 29                                          
> Number of synthesis threads: 29
>     Average synthesis took: 2182.100 usec (+- 41.649 usec)                   
> Average synthesis took: 3708.400 usec (+- 103.717 usec)
>     Average num. events: 954.000 (+- 0.000)                                  
> Average num. events: 996.000 (+- 0.000)
>     Average time per event 2.287 usec                                        
> Average time per event 3.723 usec
>   Number of synthesis threads: 30                                          
> Number of synthesis threads: 30
>     Average synthesis took: 2246.100 usec (+- 58.252 usec)                   
> Average synthesis took: 3820.500 usec (+- 95.282 usec)
>     Average num. events: 954.000 (+- 0.000)                                  
> Average num. events: 998.000 (+- 0.000)
>     Average time per event 2.354 usec                                        
> Average time per event 3.828 usec
>   Number of synthesis threads: 31                                          
> Number of synthesis threads: 31
>     Average synthesis took: 2156.900 usec (+- 26.141 usec)                   
> Average synthesis took: 3881.400 usec (+- 36.277 usec)
>     Average num. events: 948.300 (+- 5.700)                                  
> Average num. events: 1000.000 (+- 0.000)
>     Average time per event 2.274 usec                                        
> Average time per event 3.881 usec
>   Number of synthesis threads: 32                                          
> Number of synthesis threads: 32
>     Average synthesis took: 2295.300 usec (+- 41.538 usec)                   
> Average synthesis took: 4191.700 usec (+- 149.780 usec)
>     Average num. events: 954.000 (+- 0.000)                                  
> Average num. events: 1002.000 (+- 0.000)
>     Average time per event 2.406 usec                                        
> Average time per event 4.183 usec
>   Number of synthesis threads: 33                                          
> Number of synthesis threads: 33
>     Average synthesis took: 2249.100 usec (+- 59.135 usec)                   
> Average synthesis took: 3988.200 usec (+- 25.015 usec)
>     Average num. events: 948.500 (+- 5.726)                                  
> Average num. events: 1004.000 (+- 0.000)
>     Average time per event 2.371 usec                                        
> Average time per event 3.972 usec
>   Number of synthesis threads: 34                                          
> Number of synthesis threads: 34
>     Average synthesis took: 2270.400 usec (+- 65.011 usec)                   
> Average synthesis took: 4064.600 usec (+- 44.158 usec)
>     Average num. events: 954.200 (+- 0.200)                                  
> Average num. events: 1006.000 (+- 0.000)
>     Average time per event 2.379 usec                                        
> Average time per event 4.040 usec
>   Number of synthesis threads: 35                                          
> Number of synthesis threads: 35
>     Average synthesis took: 2259.200 usec (+- 44.287 usec)                   
> Average synthesis took: 4145.700 usec (+- 37.297 usec)
>     Average num. events: 954.000 (+- 0.000)                                  
> Average num. events: 1008.000 (+- 0.000)
>     Average time per event 2.368 usec                                        
> Average time per event 4.113 usec
>   Number of synthesis threads: 36                                          
> Number of synthesis threads: 36
>     Average synthesis took: 2294.100 usec (+- 38.693 usec)                   
> Average synthesis took: 4234.900 usec (+- 81.904 usec)
>     Average num. events: 954.000 (+- 0.000)                                  
> Average num. events: 1010.400 (+- 0.267)
>     Average time per event 2.405 usec                                        
> Average time per event 4.191 usec
>   Number of synthesis threads: 37                                          
> Number of synthesis threads: 37
>     Average synthesis took: 2338.900 usec (+- 80.346 usec)                   
> Average synthesis took: 4337.900 usec (+- 30.071 usec)
>     Average num. events: 954.400 (+- 0.267)                                  
> Average num. events: 1014.000 (+- 0.000)
>     Average time per event 2.451 usec                                        
> Average time per event 4.278 usec
>   Number of synthesis threads: 38                                          
> Number of synthesis threads: 38
>     Average synthesis took: 2406.300 usec (+- 57.140 usec)                   
> Average synthesis took: 4426.600 usec (+- 27.035 usec)
>     Average num. events: 938.400 (+- 7.730)                                  
> Average num. events: 1016.000 (+- 0.000)
>     Average time per event 2.564 usec                                        
> Average time per event 4.357 usec
>   Number of synthesis threads: 39                                          
> Number of synthesis threads: 39
>     Average synthesis took: 2371.000 usec (+- 35.676 usec)                   
> Average synthesis took: 5979.000 usec (+- 1518.855 usec)
>     Average num. events: 963.000 (+- 0.000)                                  
> Average num. events: 1018.000 (+- 0.000)
>     Average time per event 2.462 usec                                        
> Average time per event 5.873 usec
>   Number of synthesis threads: 40                                          
> Number of synthesis threads: 40
>     Average synthesis took: 2489.400 usec (+- 49.832 usec)                   
> Average synthesis took: 4576.500 usec (+- 75.278 usec)
>     Average num. events: 956.800 (+- 6.721)                                  
> Average num. events: 1020.000 (+- 0.000)
>     Average time per event 2.602 usec                                        
> Average time per event 4.487 usec
>


2021-08-09 14:14:53

by Jiri Olsa

[permalink] [raw]
Subject: Re: [RFC PATCH v2 10/10] perf synthetic-events: use workqueue parallel_for

On Fri, Jul 30, 2021 at 05:34:17PM +0200, Riccardo Mancini wrote:
> To generate synthetic events, perf has the option to use multiple
> threads. These threads are created manually using pthread_created.
>
> This patch replaces the manual pthread_create with a workqueue,
> using the parallel_for utility.

hi,
I really like this new interface

>
> Experimental results show that workqueue has a slightly higher overhead,
> but this is repayed by the improved work balancing among threads.

how did you measure that balancing improvement?
is there less kernel cycles spent?

I ran the benchmark and if I'm reading the results correctly I see
performance drop for high cpu numbers (full list attached below).


old perf: | new perf:

[jolsa@dell-r440-01 perf]$ ./perf.old bench internals synthesize -t | [jolsa@dell-r440-01 perf]$ ./perf bench internals synthesize -t
...
Number of synthesis threads: 40 | Number of synthesis threads: 40
Average synthesis took: 2489.400 usec (+- 49.832 usec) | Average synthesis took: 4576.500 usec (+- 75.278 usec)
Average num. events: 956.800 (+- 6.721) | Average num. events: 1020.000 (+- 0.000)
Average time per event 2.602 usec | Average time per event 4.487 usec

maybe profiling will show what's going on?

thanks,
jirka


---
[jolsa@dell-r440-01 perf]$ ./perf.old bench internals synthesize -t [jolsa@dell-r440-01 perf]$ ./perf bench internals synthesize -t
# Running 'internals/synthesize' benchmark: # Running 'internals/synthesize' benchmark:
Computing performance of multi threaded perf event synthesis by Computing performance of multi threaded perf event synthesis by
synthesizing events on CPU 0: synthesizing events on CPU 0:
Number of synthesis threads: 1 Number of synthesis threads: 1
Average synthesis took: 7907.100 usec (+- 197.363 usec) Average synthesis took: 7972.900 usec (+- 198.158 usec)
Average num. events: 956.000 (+- 0.000) Average num. events: 936.000 (+- 0.000)
Average time per event 8.271 usec Average time per event 8.518 usec
Number of synthesis threads: 2 Number of synthesis threads: 2
Average synthesis took: 5616.800 usec (+- 61.253 usec) Average synthesis took: 5844.700 usec (+- 87.219 usec)
Average num. events: 958.800 (+- 0.327) Average num. events: 940.000 (+- 0.000)
Average time per event 5.858 usec Average time per event 6.218 usec
Number of synthesis threads: 3 Number of synthesis threads: 3
Average synthesis took: 4274.000 usec (+- 93.293 usec) Average synthesis took: 4019.700 usec (+- 67.354 usec)
Average num. events: 962.000 (+- 0.000) Average num. events: 942.000 (+- 0.000)
Average time per event 4.443 usec Average time per event 4.267 usec
Number of synthesis threads: 4 Number of synthesis threads: 4
Average synthesis took: 3425.700 usec (+- 43.044 usec) Average synthesis took: 3382.200 usec (+- 74.652 usec)
Average num. events: 959.600 (+- 0.933) Average num. events: 944.000 (+- 0.000)
Average time per event 3.570 usec Average time per event 3.583 usec
Number of synthesis threads: 5 Number of synthesis threads: 5
Average synthesis took: 2958.000 usec (+- 82.951 usec) Average synthesis took: 3086.500 usec (+- 48.213 usec)
Average num. events: 966.000 (+- 0.000) Average num. events: 946.000 (+- 0.000)
Average time per event 3.062 usec Average time per event 3.263 usec
Number of synthesis threads: 6 Number of synthesis threads: 6
Average synthesis took: 2808.400 usec (+- 66.868 usec) Average synthesis took: 2752.200 usec (+- 56.411 usec)
Average num. events: 956.800 (+- 0.327) Average num. events: 948.000 (+- 0.000)
Average time per event 2.935 usec Average time per event 2.903 usec
Number of synthesis threads: 7 Number of synthesis threads: 7
Average synthesis took: 2622.900 usec (+- 83.524 usec) Average synthesis took: 2548.200 usec (+- 48.042 usec)
Average num. events: 958.400 (+- 0.267) Average num. events: 950.000 (+- 0.000)
Average time per event 2.737 usec Average time per event 2.682 usec
Number of synthesis threads: 8 Number of synthesis threads: 8
Average synthesis took: 2271.600 usec (+- 29.181 usec) Average synthesis took: 2486.600 usec (+- 47.862 usec)
Average num. events: 972.000 (+- 0.000) Average num. events: 952.000 (+- 0.000)
Average time per event 2.337 usec Average time per event 2.612 usec
Number of synthesis threads: 9 Number of synthesis threads: 9
Average synthesis took: 2372.000 usec (+- 95.495 usec) Average synthesis took: 2347.300 usec (+- 23.959 usec)
Average num. events: 959.200 (+- 0.952) Average num. events: 954.000 (+- 0.000)
Average time per event 2.473 usec Average time per event 2.460 usec
Number of synthesis threads: 10 Number of synthesis threads: 10
Average synthesis took: 2544.600 usec (+- 107.569 usec) Average synthesis took: 2328.800 usec (+- 14.234 usec)
Average num. events: 968.400 (+- 3.124) Average num. events: 957.400 (+- 0.306)
Average time per event 2.628 usec Average time per event 2.432 usec
Number of synthesis threads: 11 Number of synthesis threads: 11
Average synthesis took: 2299.300 usec (+- 57.597 usec) Average synthesis took: 2340.300 usec (+- 34.638 usec)
Average num. events: 956.000 (+- 0.000) Average num. events: 960.000 (+- 0.000)
Average time per event 2.405 usec Average time per event 2.438 usec
Number of synthesis threads: 12 Number of synthesis threads: 12
Average synthesis took: 2545.500 usec (+- 69.557 usec) Average synthesis took: 2318.700 usec (+- 15.803 usec)
Average num. events: 974.800 (+- 0.611) Average num. events: 963.800 (+- 0.200)
Average time per event 2.611 usec Average time per event 2.406 usec
Number of synthesis threads: 13 Number of synthesis threads: 13
Average synthesis took: 2386.400 usec (+- 79.244 usec) Average synthesis took: 2408.700 usec (+- 27.071 usec)
Average num. events: 950.500 (+- 5.726) Average num. events: 966.000 (+- 0.000)
Average time per event 2.511 usec Average time per event 2.493 usec
Number of synthesis threads: 14 Number of synthesis threads: 14
Average synthesis took: 2466.600 usec (+- 57.893 usec) Average synthesis took: 2547.200 usec (+- 53.445 usec)
Average num. events: 957.600 (+- 0.718) Average num. events: 968.000 (+- 0.000)
Average time per event 2.576 usec Average time per event 2.631 usec
Number of synthesis threads: 15 Number of synthesis threads: 15
Average synthesis took: 2249.700 usec (+- 64.026 usec) Average synthesis took: 2647.900 usec (+- 79.014 usec)
Average num. events: 956.000 (+- 0.000) Average num. events: 970.000 (+- 0.000)
Average time per event 2.353 usec Average time per event 2.730 usec
Number of synthesis threads: 16 Number of synthesis threads: 16
Average synthesis took: 2311.700 usec (+- 64.304 usec) Average synthesis took: 2676.200 usec (+- 34.824 usec)
Average num. events: 955.000 (+- 0.907) Average num. events: 972.000 (+- 0.000)
Average time per event 2.421 usec Average time per event 2.753 usec
Number of synthesis threads: 17 Number of synthesis threads: 17
Average synthesis took: 2174.100 usec (+- 36.673 usec) Average synthesis took: 2580.100 usec (+- 45.414 usec)
Average num. events: 971.600 (+- 3.124) Average num. events: 974.000 (+- 0.000)
Average time per event 2.238 usec Average time per event 2.649 usec
Number of synthesis threads: 18 Number of synthesis threads: 18
Average synthesis took: 2294.200 usec (+- 63.657 usec) Average synthesis took: 2810.200 usec (+- 49.113 usec)
Average num. events: 953.200 (+- 0.611) Average num. events: 976.000 (+- 0.000)
Average time per event 2.407 usec Average time per event 2.879 usec
Number of synthesis threads: 19 Number of synthesis threads: 19
Average synthesis took: 2410.700 usec (+- 120.169 usec) Average synthesis took: 2862.400 usec (+- 36.982 usec)
Average num. events: 953.400 (+- 0.306) Average num. events: 978.000 (+- 0.000)
Average time per event 2.529 usec Average time per event 2.927 usec
Number of synthesis threads: 20 Number of synthesis threads: 20
Average synthesis took: 2387.000 usec (+- 91.051 usec) Average synthesis took: 2908.800 usec (+- 36.404 usec)
Average num. events: 952.800 (+- 0.800) Average num. events: 978.600 (+- 0.306)
Average time per event 2.505 usec Average time per event 2.972 usec
Number of synthesis threads: 21 Number of synthesis threads: 21
Average synthesis took: 2275.700 usec (+- 39.815 usec) Average synthesis took: 3141.100 usec (+- 30.896 usec)
Average num. events: 954.600 (+- 0.306) Average num. events: 980.000 (+- 0.000)
Average time per event 2.384 usec Average time per event 3.205 usec
Number of synthesis threads: 22 Number of synthesis threads: 22
Average synthesis took: 2373.200 usec (+- 89.528 usec) Average synthesis took: 3342.400 usec (+- 112.115 usec)
Average num. events: 949.100 (+- 5.843) Average num. events: 982.000 (+- 0.000)
Average time per event 2.500 usec Average time per event 3.404 usec
Number of synthesis threads: 23 Number of synthesis threads: 23
Average synthesis took: 2318.300 usec (+- 39.395 usec) Average synthesis took: 3269.700 usec (+- 55.215 usec)
Average num. events: 954.600 (+- 0.427) Average num. events: 984.000 (+- 0.000)
Average time per event 2.429 usec Average time per event 3.323 usec
Number of synthesis threads: 24 Number of synthesis threads: 24
Average synthesis took: 2241.900 usec (+- 52.577 usec) Average synthesis took: 3379.500 usec (+- 56.380 usec)
Average num. events: 954.000 (+- 0.000) Average num. events: 986.000 (+- 0.000)
Average time per event 2.350 usec Average time per event 3.427 usec
Number of synthesis threads: 25 Number of synthesis threads: 25
Average synthesis took: 2343.400 usec (+- 101.611 usec) Average synthesis took: 3382.500 usec (+- 51.535 usec)
Average num. events: 956.200 (+- 1.009) Average num. events: 988.000 (+- 0.000)
Average time per event 2.451 usec Average time per event 3.424 usec
Number of synthesis threads: 26 Number of synthesis threads: 26
Average synthesis took: 2260.700 usec (+- 18.863 usec) Average synthesis took: 3391.600 usec (+- 44.053 usec)
Average num. events: 954.000 (+- 0.000) Average num. events: 990.000 (+- 0.000)
Average time per event 2.370 usec Average time per event 3.426 usec
Number of synthesis threads: 27 Number of synthesis threads: 27
Average synthesis took: 2373.800 usec (+- 74.213 usec) Average synthesis took: 3659.200 usec (+- 113.176 usec)
Average num. events: 955.000 (+- 0.803) Average num. events: 992.000 (+- 0.000)
Average time per event 2.486 usec Average time per event 3.689 usec
Number of synthesis threads: 28 Number of synthesis threads: 28
Average synthesis took: 2335.500 usec (+- 49.480 usec) Average synthesis took: 3625.000 usec (+- 90.131 usec)
Average num. events: 954.000 (+- 0.000) Average num. events: 994.000 (+- 0.000)
Average time per event 2.448 usec Average time per event 3.647 usec
Number of synthesis threads: 29 Number of synthesis threads: 29
Average synthesis took: 2182.100 usec (+- 41.649 usec) Average synthesis took: 3708.400 usec (+- 103.717 usec)
Average num. events: 954.000 (+- 0.000) Average num. events: 996.000 (+- 0.000)
Average time per event 2.287 usec Average time per event 3.723 usec
Number of synthesis threads: 30 Number of synthesis threads: 30
Average synthesis took: 2246.100 usec (+- 58.252 usec) Average synthesis took: 3820.500 usec (+- 95.282 usec)
Average num. events: 954.000 (+- 0.000) Average num. events: 998.000 (+- 0.000)
Average time per event 2.354 usec Average time per event 3.828 usec
Number of synthesis threads: 31 Number of synthesis threads: 31
Average synthesis took: 2156.900 usec (+- 26.141 usec) Average synthesis took: 3881.400 usec (+- 36.277 usec)
Average num. events: 948.300 (+- 5.700) Average num. events: 1000.000 (+- 0.000)
Average time per event 2.274 usec Average time per event 3.881 usec
Number of synthesis threads: 32 Number of synthesis threads: 32
Average synthesis took: 2295.300 usec (+- 41.538 usec) Average synthesis took: 4191.700 usec (+- 149.780 usec)
Average num. events: 954.000 (+- 0.000) Average num. events: 1002.000 (+- 0.000)
Average time per event 2.406 usec Average time per event 4.183 usec
Number of synthesis threads: 33 Number of synthesis threads: 33
Average synthesis took: 2249.100 usec (+- 59.135 usec) Average synthesis took: 3988.200 usec (+- 25.015 usec)
Average num. events: 948.500 (+- 5.726) Average num. events: 1004.000 (+- 0.000)
Average time per event 2.371 usec Average time per event 3.972 usec
Number of synthesis threads: 34 Number of synthesis threads: 34
Average synthesis took: 2270.400 usec (+- 65.011 usec) Average synthesis took: 4064.600 usec (+- 44.158 usec)
Average num. events: 954.200 (+- 0.200) Average num. events: 1006.000 (+- 0.000)
Average time per event 2.379 usec Average time per event 4.040 usec
Number of synthesis threads: 35 Number of synthesis threads: 35
Average synthesis took: 2259.200 usec (+- 44.287 usec) Average synthesis took: 4145.700 usec (+- 37.297 usec)
Average num. events: 954.000 (+- 0.000) Average num. events: 1008.000 (+- 0.000)
Average time per event 2.368 usec Average time per event 4.113 usec
Number of synthesis threads: 36 Number of synthesis threads: 36
Average synthesis took: 2294.100 usec (+- 38.693 usec) Average synthesis took: 4234.900 usec (+- 81.904 usec)
Average num. events: 954.000 (+- 0.000) Average num. events: 1010.400 (+- 0.267)
Average time per event 2.405 usec Average time per event 4.191 usec
Number of synthesis threads: 37 Number of synthesis threads: 37
Average synthesis took: 2338.900 usec (+- 80.346 usec) Average synthesis took: 4337.900 usec (+- 30.071 usec)
Average num. events: 954.400 (+- 0.267) Average num. events: 1014.000 (+- 0.000)
Average time per event 2.451 usec Average time per event 4.278 usec
Number of synthesis threads: 38 Number of synthesis threads: 38
Average synthesis took: 2406.300 usec (+- 57.140 usec) Average synthesis took: 4426.600 usec (+- 27.035 usec)
Average num. events: 938.400 (+- 7.730) Average num. events: 1016.000 (+- 0.000)
Average time per event 2.564 usec Average time per event 4.357 usec
Number of synthesis threads: 39 Number of synthesis threads: 39
Average synthesis took: 2371.000 usec (+- 35.676 usec) Average synthesis took: 5979.000 usec (+- 1518.855 usec)
Average num. events: 963.000 (+- 0.000) Average num. events: 1018.000 (+- 0.000)
Average time per event 2.462 usec Average time per event 5.873 usec
Number of synthesis threads: 40 Number of synthesis threads: 40
Average synthesis took: 2489.400 usec (+- 49.832 usec) Average synthesis took: 4576.500 usec (+- 75.278 usec)
Average num. events: 956.800 (+- 6.721) Average num. events: 1020.000 (+- 0.000)
Average time per event 2.602 usec Average time per event 4.487 usec