2020-04-20 20:59:41

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 01/15] sched: make nr_running() return "unsigned int"

I don't think anyone has been crazy enough to spawn 2^32 threads.
It'd require absurd amounts of physical memory, and bump into futex pid
limit anyway.

Meanwhile save few bits on REX prefixes and some stack space for upcoming
print_integer() stuff.
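
(A quick illustration, not part of the patch and assuming the usual
x86-64 codegen: arithmetic on a 64-bit value carries a REX.W prefix
byte that the 32-bit form on the legacy registers does not, e.g.

	unsigned long sum64(unsigned long a, unsigned long b)
	{
		return a + b;	/* e.g. add %rsi, %rdi = 48 01 f7 (REX.W prefix) */
	}

	unsigned int sum32(unsigned int a, unsigned int b)
	{
		return a + b;	/* e.g. add %esi, %edi =    01 f7 (no prefix) */
	}

so keeping these counters in "unsigned int" makes the hot /proc paths
a little smaller.)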

And remove "extern" from prototypes while I'm at it.

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/loadavg.c | 2 +-
fs/proc/stat.c | 2 +-
include/linux/sched/stat.h | 2 +-
kernel/sched/core.c | 4 ++--
4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 8468baee951d..f32878d9a39f 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -16,7 +16,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)

get_avenrun(avnrun, FIXED_1/200, 0);

- seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+ seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 46b3293015fe..93ce344f62a5 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -197,7 +197,7 @@ static int show_stat(struct seq_file *p, void *v)
"\nctxt %llu\n"
"btime %llu\n"
"processes %lu\n"
- "procs_running %lu\n"
+ "procs_running %u\n"
"procs_blocked %lu\n",
nr_context_switches(),
(unsigned long long)boottime.tv_sec,
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 568286411b43..f3b86515bafe 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -16,7 +16,7 @@ extern unsigned long total_forks;
extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
-extern unsigned long nr_running(void);
+unsigned int nr_running(void);
extern bool single_task_running(void);
extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(int cpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3a61a3b8eaa9..d9bae602966c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3381,9 +3381,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
* externally visible scheduler statistics: current number of runnable
* threads, total number of context switches performed since bootup.
*/
-unsigned long nr_running(void)
+unsigned int nr_running(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;

for_each_online_cpu(i)
sum += cpu_rq(i)->nr_running;
--
2.24.1


2020-04-20 20:59:44

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 04/15] print_integer, proc: rewrite /proc/self via print_integer()

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/internal.h | 1 +
fs/proc/self.c | 8 ++++++--
2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 917cc85e3466..8abfd2fa0963 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -13,6 +13,7 @@
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
+#include "../../lib/print-integer.h"

struct ctl_table_header;
struct mempolicy;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 57c0a1047250..c8a1662e2cfd 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -14,15 +14,19 @@ static const char *proc_self_get_link(struct dentry *dentry,
{
struct pid_namespace *ns = proc_pid_ns(inode);
pid_t tgid = task_tgid_nr_ns(current, ns);
+ char buf[10 + 1];
+ char *p = buf + sizeof(buf);
char *name;

if (!tgid)
return ERR_PTR(-ENOENT);
/* max length of unsigned int in decimal + NULL term */
- name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
+ name = kmalloc(sizeof(buf), dentry ? GFP_KERNEL : GFP_ATOMIC);
if (unlikely(!name))
return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
- sprintf(name, "%u", tgid);
+ *--p = '\0';
+ p = _print_integer_u32(p, tgid);
+ memcpy(name, p, buf + sizeof(buf) - p);
set_delayed_call(done, kfree_link, name);
return name;
}
--
2.24.1

2020-04-20 20:59:48

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 07/15] print_integer, proc: rewrite /proc/stat via print_integer()

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/stat.c | 136 ++++++++++++++++++++++++++++++-------------------
1 file changed, 83 insertions(+), 53 deletions(-)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 678feb7b9949..859dc49cca85 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -14,6 +14,8 @@
#include <linux/sched/cputime.h>
#include <linux/tick.h>

+#include "../../lib/print-integer.h"
+
#ifndef arch_irq_stat_cpu
#define arch_irq_stat_cpu(cpu) 0
#endif
@@ -104,19 +106,30 @@ static void show_all_irqs(struct seq_file *p)
show_irq_gap(p, nr_irqs - next);
}

+enum {
+ USER,
+ NICE,
+ SYSTEM,
+ IDLE,
+ IOWAIT,
+ IRQ,
+ SOFTIRQ,
+ STEAL,
+ GUEST,
+ GUEST_NICE,
+
+ NR_VAL
+};
+
static int show_stat(struct seq_file *p, void *v)
{
+ u64 val[NR_VAL] = {};
int i, j;
- u64 user, nice, system, idle, iowait, irq, softirq, steal;
- u64 guest, guest_nice;
u64 sum = 0;
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec64 boottime;

- user = nice = system = idle = iowait =
- irq = softirq = steal = 0;
- guest = guest_nice = 0;
getboottime64(&boottime);

for_each_possible_cpu(i) {
@@ -125,16 +138,16 @@ static int show_stat(struct seq_file *p, void *v)

kcpustat_cpu_fetch(&kcpustat, i);

- user += cpustat[CPUTIME_USER];
- nice += cpustat[CPUTIME_NICE];
- system += cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(&kcpustat, i);
- iowait += get_iowait_time(&kcpustat, i);
- irq += cpustat[CPUTIME_IRQ];
- softirq += cpustat[CPUTIME_SOFTIRQ];
- steal += cpustat[CPUTIME_STEAL];
- guest += cpustat[CPUTIME_GUEST];
- guest_nice += cpustat[CPUTIME_GUEST_NICE];
+ val[USER] += cpustat[CPUTIME_USER];
+ val[NICE] += cpustat[CPUTIME_NICE];
+ val[SYSTEM] += cpustat[CPUTIME_SYSTEM];
+ val[IDLE] += get_idle_time(&kcpustat, i);
+ val[IOWAIT] += get_iowait_time(&kcpustat, i);
+ val[IRQ] += cpustat[CPUTIME_IRQ];
+ val[SOFTIRQ] += cpustat[CPUTIME_SOFTIRQ];
+ val[STEAL] += cpustat[CPUTIME_STEAL];
+ val[GUEST] += cpustat[CPUTIME_GUEST];
+ val[GUEST_NICE] += cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);

@@ -147,47 +160,55 @@ static int show_stat(struct seq_file *p, void *v)
}
sum += arch_irq_stat();

- seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(system));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
- seq_putc(p, '\n');
+ {
+ char buf[4 + NR_VAL * (1 + 20) + 1];
+ char *q = buf + sizeof(buf);
+
+ *--q = '\n';
+ for (i = NR_VAL - 1; i >= 0; i--) {
+ q = _print_integer_u64(q, nsec_to_clock_t(val[i]));
+ *--q = ' ';
+ }
+ q = memcpy(q - 4, "cpu ", 4);
+
+ seq_write(p, q, buf + sizeof(buf) - q);
+ }

for_each_online_cpu(i) {
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
+ char buf[3 + 10 + NR_VAL * (1 + 20) + 1];
+ char *q = buf + sizeof(buf);

kcpustat_cpu_fetch(&kcpustat, i);

- /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = cpustat[CPUTIME_USER];
- nice = cpustat[CPUTIME_NICE];
- system = cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(&kcpustat, i);
- iowait = get_iowait_time(&kcpustat, i);
- irq = cpustat[CPUTIME_IRQ];
- softirq = cpustat[CPUTIME_SOFTIRQ];
- steal = cpustat[CPUTIME_STEAL];
- guest = cpustat[CPUTIME_GUEST];
- guest_nice = cpustat[CPUTIME_GUEST_NICE];
- seq_printf(p, "cpu%d", i);
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(system));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
- seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
- seq_putc(p, '\n');
+ *--q = '\n';
+ q = _print_integer_u64(q, nsec_to_clock_t(cpustat[CPUTIME_GUEST_NICE]));
+ *--q = ' ';
+ q = _print_integer_u64(q, nsec_to_clock_t(cpustat[CPUTIME_GUEST]));
+ *--q = ' ';
+ q = _print_integer_u64(q, nsec_to_clock_t(cpustat[CPUTIME_STEAL]));
+ *--q = ' ';
+ q = _print_integer_u64(q, nsec_to_clock_t(cpustat[CPUTIME_SOFTIRQ]));
+ *--q = ' ';
+ q = _print_integer_u64(q, nsec_to_clock_t(cpustat[CPUTIME_IRQ]));
+ *--q = ' ';
+ q = _print_integer_u64(q, nsec_to_clock_t(get_iowait_time(&kcpustat, i)));
+ *--q = ' ';
+ q = _print_integer_u64(q, nsec_to_clock_t(get_idle_time(&kcpustat, i)));
+ *--q = ' ';
+ q = _print_integer_u64(q, nsec_to_clock_t(cpustat[CPUTIME_SYSTEM]));
+ *--q = ' ';
+ q = _print_integer_u64(q, nsec_to_clock_t(cpustat[CPUTIME_NICE]));
+ *--q = ' ';
+ q = _print_integer_u64(q, nsec_to_clock_t(cpustat[CPUTIME_USER]));
+ *--q = ' ';
+ q = _print_integer_u32(q, i);
+ q = memcpy(q - 3, "cpu", 3);
+
+ seq_write(p, q, buf + sizeof(buf) - q);
}
seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);

@@ -205,11 +226,20 @@ static int show_stat(struct seq_file *p, void *v)
nr_running(),
nr_iowait());

- seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);
+ {
+ char buf[8 + 20 + (1 + 10) * NR_SOFTIRQS + 1];
+ char *q = buf + sizeof(buf);

- for (i = 0; i < NR_SOFTIRQS; i++)
- seq_put_decimal_ull(p, " ", per_softirq_sums[i]);
- seq_putc(p, '\n');
+ *--q = '\n';
+ for (i = NR_SOFTIRQS - 1; i >= 0; i--) {
+ q = _print_integer_u32(q, per_softirq_sums[i]);
+ *--q = ' ';
+ }
+ q = _print_integer_u64(q, sum_softirq);
+ q = memcpy(q - 8, "softirq ", 8);
+
+ seq_write(p, q, buf + sizeof(buf) - q);
+ }

return 0;
}
--
2.24.1

2020-04-20 21:00:00

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 10/15] print_integer, proc: rewrite /proc/*/fd via print_integer()

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/fd.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index e098302b5101..059a3404c785 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -247,8 +247,8 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
fd++, ctx->pos++) {
struct file *f;
struct fd_data data;
- char name[10 + 1];
- unsigned int len;
+ char buf[10];
+ char *p = buf + sizeof(buf);

f = fcheck_files(files, fd);
if (!f)
@@ -257,9 +257,9 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
rcu_read_unlock();
data.fd = fd;

- len = snprintf(name, sizeof(name), "%u", fd);
+ p = _print_integer_u32(p, fd);
if (!proc_fill_cache(file, ctx,
- name, len, instantiate, tsk,
+ p, buf + sizeof(buf) - p, instantiate, tsk,
&data))
goto out_fd_loop;
cond_resched();
--
2.24.1

2020-04-20 21:00:06

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 15/15] print_integer, proc: rewrite /proc/meminfo via print_integer()

This actually makes /proc/meminfo slower, I need to check what's
going on.

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/meminfo.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8c1f1bb1a5ce..6dff2298cc3f 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -24,12 +24,24 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
{
}

-static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
+static void _show_val_kb(struct seq_file *m, const char *s, size_t len,
+ unsigned long num)
{
- seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
- seq_write(m, " kB\n", 4);
+ char buf[64];
+ char *p = buf + sizeof(buf);
+ char *tmp;
+
+ p = memcpy(p - 4, " kB\n", 4);
+ tmp = memcpy(p - 8, " ", 8);
+ p = _print_integer_ul(p, num << (PAGE_SHIFT - 10));
+ p = min(p, tmp);
+ p = memcpy(p - len, s, len);
+
+ seq_write(m, p, buf + sizeof(buf) - p);
}

+#define show_val_kb(m, s, n) _show_val_kb((m), (s), strlen(s), (n))
+
static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
--
2.24.1

2020-04-20 21:00:16

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 12/15] print_integer, proc: rewrite /proc/*/statm via print_integer()

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/array.c | 32 ++++++++++++++++++--------------
1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 6986f9f68ab7..16d66538fa61 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -695,24 +695,28 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
unsigned long shared = 0;
unsigned long text = 0;
unsigned long data = 0;
+ char buf[5 * LEN_UL + 9];
+ char *p = buf + sizeof(buf);

size = task_statm(mm, &shared, &text, &data, &resident);
mmput(mm);

- /*
- * For quick read, open code by putting numbers directly
- * expected format is
- * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
- * size, resident, shared, text, data);
- */
- seq_put_decimal_ull(m, "", size);
- seq_put_decimal_ull(m, " ", resident);
- seq_put_decimal_ull(m, " ", shared);
- seq_put_decimal_ull(m, " ", text);
- seq_put_decimal_ull(m, " ", 0);
- seq_put_decimal_ull(m, " ", data);
- seq_put_decimal_ull(m, " ", 0);
- seq_putc(m, '\n');
+ *--p = '\n';
+ *--p = '0';
+ *--p = ' ';
+ p = _print_integer_ul(p, data);
+ *--p = ' ';
+ *--p = '0';
+ *--p = ' ';
+ p = _print_integer_ul(p, text);
+ *--p = ' ';
+ p = _print_integer_ul(p, shared);
+ *--p = ' ';
+ p = _print_integer_ul(p, resident);
+ *--p = ' ';
+ p = _print_integer_ul(p, size);
+
+ seq_write(m, p, buf + sizeof(buf) - p);
} else {
seq_write(m, "0 0 0 0 0 0 0\n", 14);
}
--
2.24.1

2020-04-20 21:00:19

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 11/15] print_integer, proc: rewrite /proc/*/stat via print_integer()

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/array.c | 192 ++++++++++++++++++++++++++++++------------------
1 file changed, 122 insertions(+), 70 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 8e16f14bb05a..6986f9f68ab7 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -446,6 +446,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
u64 cgtime, gtime;
unsigned long rsslim = 0;
unsigned long flags;
+ char buf[11 * (1 + LEN_U32) + 7 * (1 + LEN_S32) + 19 * (1 + LEN_UL) +
+ 8 * (1 + LEN_U64) + 12];
+ char *p;

state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -535,47 +538,54 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(task->start_boottime);

- seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
- seq_puts(m, " (");
+ p = buf + sizeof(buf);
+ *--p = '(';
+ *--p = ' ';
+ p = _print_integer_u32(p, pid_nr_ns(pid, ns));
+ seq_write(m, p, buf + sizeof(buf) - p);
+
proc_task_name(m, task, false);
- seq_puts(m, ") ");
- seq_putc(m, state);
- seq_put_decimal_ll(m, " ", ppid);
- seq_put_decimal_ll(m, " ", pgid);
- seq_put_decimal_ll(m, " ", sid);
- seq_put_decimal_ll(m, " ", tty_nr);
- seq_put_decimal_ll(m, " ", tty_pgrp);
- seq_put_decimal_ull(m, " ", task->flags);
- seq_put_decimal_ull(m, " ", min_flt);
- seq_put_decimal_ull(m, " ", cmin_flt);
- seq_put_decimal_ull(m, " ", maj_flt);
- seq_put_decimal_ull(m, " ", cmaj_flt);
- seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime));
- seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime));
- seq_put_decimal_ll(m, " ", nsec_to_clock_t(cutime));
- seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime));
- seq_put_decimal_ll(m, " ", priority);
- seq_put_decimal_ll(m, " ", nice);
- seq_put_decimal_ll(m, " ", num_threads);
- seq_put_decimal_ull(m, " ", 0);
- seq_put_decimal_ull(m, " ", start_time);
- seq_put_decimal_ull(m, " ", vsize);
- seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0);
- seq_put_decimal_ull(m, " ", rsslim);
- seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0);
- seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0);
- seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0);
- seq_put_decimal_ull(m, " ", esp);
- seq_put_decimal_ull(m, " ", eip);
- /* The signal information here is obsolete.
- * It must be decimal for Linux 2.0 compatibility.
- * Use /proc/#/status for real-time signals.
- */
- seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL);

+ p = buf + sizeof(buf);
+ *--p = '\n';
+ p = _print_integer_s32(p, permitted ? task->exit_code : 0);
+ *--p = ' ';
+ if (mm && permitted) {
+ p = _print_integer_ul(p, mm->env_end);
+ *--p = ' ';
+ p = _print_integer_ul(p, mm->env_start);
+ *--p = ' ';
+ p = _print_integer_ul(p, mm->arg_end);
+ *--p = ' ';
+ p = _print_integer_ul(p, mm->arg_start);
+ *--p = ' ';
+ p = _print_integer_ul(p, mm->start_brk);
+ *--p = ' ';
+ p = _print_integer_ul(p, mm->end_data);
+ *--p = ' ';
+ p = _print_integer_ul(p, mm->start_data);
+ *--p = ' ';
+ } else {
+ p = memcpy(p - 14, " 0 0 0 0 0 0 0", 14);
+ }
+ p = _print_integer_u64(p, nsec_to_clock_t(cgtime));
+ *--p = ' ';
+ p = _print_integer_u64(p, nsec_to_clock_t(gtime));
+ *--p = ' ';
+ p = _print_integer_u64(p, delayacct_blkio_ticks(task));
+ *--p = ' ';
+ p = _print_integer_u32(p, task->policy);
+ *--p = ' ';
+ p = _print_integer_u32(p, task->rt_priority);
+ *--p = ' ';
+ p = _print_integer_u32(p, task_cpu(task));
+ *--p = ' ';
+ p = _print_integer_s32(p, task->exit_signal);
+ *--p = ' ';
+ *--p = '0';
+ *--p = ' ';
+ *--p = '0';
+ *--p = ' ';
/*
* We used to output the absolute kernel address, but that's an
* information leak - so instead we show a 0/1 flag here, to signal
@@ -583,38 +593,80 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
*
* This works with older implementations of procps as well.
*/
- if (wchan)
- seq_puts(m, " 1");
- else
- seq_puts(m, " 0");
-
- seq_put_decimal_ull(m, " ", 0);
- seq_put_decimal_ull(m, " ", 0);
- seq_put_decimal_ll(m, " ", task->exit_signal);
- seq_put_decimal_ll(m, " ", task_cpu(task));
- seq_put_decimal_ull(m, " ", task->rt_priority);
- seq_put_decimal_ull(m, " ", task->policy);
- seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task));
- seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime));
- seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime));
-
- if (mm && permitted) {
- seq_put_decimal_ull(m, " ", mm->start_data);
- seq_put_decimal_ull(m, " ", mm->end_data);
- seq_put_decimal_ull(m, " ", mm->start_brk);
- seq_put_decimal_ull(m, " ", mm->arg_start);
- seq_put_decimal_ull(m, " ", mm->arg_end);
- seq_put_decimal_ull(m, " ", mm->env_start);
- seq_put_decimal_ull(m, " ", mm->env_end);
- } else
- seq_puts(m, " 0 0 0 0 0 0 0");
-
- if (permitted)
- seq_put_decimal_ll(m, " ", task->exit_code);
- else
- seq_puts(m, " 0");
+ *--p = '0' + !!wchan;
+ *--p = ' ';
+ /*
+ * The signal information here is obsolete.
+ * It must be decimal for Linux 2.0 compatibility.
+ * Use /proc/#/status for real-time signals.
+ */
+ p = _print_integer_u32(p, sigcatch.sig[0] & 0x7fffffff);
+ *--p = ' ';
+ p = _print_integer_u32(p, sigign.sig[0] & 0x7fffffff);
+ *--p = ' ';
+ p = _print_integer_u32(p, task->blocked.sig[0] & 0x7fffffff);
+ *--p = ' ';
+ p = _print_integer_u32(p, task->pending.signal.sig[0] & 0x7fffffff);
+ *--p = ' ';
+ p = _print_integer_ul(p, eip);
+ *--p = ' ';
+ p = _print_integer_ul(p, esp);
+ *--p = ' ';
+ p = _print_integer_ul(p, (permitted && mm) ? mm->start_stack : 0);
+ *--p = ' ';
+ p = _print_integer_ul(p, mm ? (permitted ? mm->end_code : 1) : 0);
+ *--p = ' ';
+ p = _print_integer_ul(p, mm ? (permitted ? mm->start_code : 1) : 0);
+ *--p = ' ';
+ p = _print_integer_ul(p, rsslim);
+ *--p = ' ';
+ p = _print_integer_ul(p, mm ? get_mm_rss(mm) : 0);
+ *--p = ' ';
+ p = _print_integer_ul(p, vsize);
+ *--p = ' ';
+ p = _print_integer_u64(p, start_time);
+ *--p = ' ';
+ *--p = '0';
+ *--p = ' ';
+ p = _print_integer_u32(p, num_threads);
+ *--p = ' ';
+ p = _print_integer_s32(p, nice);
+ *--p = ' ';
+ p = _print_integer_s32(p, priority);
+ *--p = ' ';
+ p = _print_integer_u64(p, nsec_to_clock_t(cstime));
+ *--p = ' ';
+ p = _print_integer_u64(p, nsec_to_clock_t(cutime));
+ *--p = ' ';
+ p = _print_integer_u64(p, nsec_to_clock_t(stime));
+ *--p = ' ';
+ p = _print_integer_u64(p, nsec_to_clock_t(utime));
+ *--p = ' ';
+ p = _print_integer_ul(p, cmaj_flt);
+ *--p = ' ';
+ p = _print_integer_ul(p, maj_flt);
+ *--p = ' ';
+ p = _print_integer_ul(p, cmin_flt);
+ *--p = ' ';
+ p = _print_integer_ul(p, min_flt);
+ *--p = ' ';
+ p = _print_integer_u32(p, task->flags);
+ *--p = ' ';
+ p = _print_integer_s32(p, tty_pgrp);
+ *--p = ' ';
+ p = _print_integer_u32(p, tty_nr);
+ *--p = ' ';
+ p = _print_integer_s32(p, sid);
+ *--p = ' ';
+ p = _print_integer_s32(p, pgid);
+ *--p = ' ';
+ p = _print_integer_u32(p, ppid);
+ *--p = ' ';
+ *--p = state;
+ *--p = ' ';
+ *--p = ')';
+ seq_write(m, p, buf + sizeof(buf) - p);

- seq_putc(m, '\n');
if (mm)
mmput(mm);
return 0;
--
2.24.1

2020-04-20 21:00:26

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 13/15] print_integer, printf: rewrite num_to_str() via print_integer()

In my opinion, num_to_str() is more understandable this way.

Signed-off-by: Alexey Dobriyan <[email protected]>
---
lib/vsprintf.c | 30 +++++++++++-------------------
1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 7c488a1ce318..df2b5ce08fe9 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -51,6 +51,7 @@

#include <linux/string_helpers.h>
#include "kstrtox.h"
+#include "print-integer.h"

/**
* simple_strtoull - convert a string to an unsigned long long
@@ -343,33 +344,24 @@ char *put_dec(char *buf, unsigned long long n)
*/
int num_to_str(char *buf, int size, unsigned long long num, unsigned int width)
{
- /* put_dec requires 2-byte alignment of the buffer. */
- char tmp[sizeof(num) * 3] __aligned(2);
- int idx, len;
+ char tmp[20];
+ char *p = tmp + sizeof(tmp);
+ ptrdiff_t len;

- /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
- if (num <= 9) {
- tmp[0] = '0' + num;
- len = 1;
- } else {
- len = put_dec(tmp, num) - tmp;
- }
+ p = _print_integer_u64(p, num);
+ len = tmp + sizeof(tmp) - p;

if (len > size || width > size)
return 0;

if (width > len) {
- width = width - len;
- for (idx = 0; idx < width; idx++)
- buf[idx] = ' ';
+ memset(buf, ' ', width - len);
+ memcpy(buf + width - len, p, len);
+ return width;
} else {
- width = 0;
+ memcpy(buf, p, len);
+ return len;
}
-
- for (idx = 0; idx < len; ++idx)
- buf[idx + width] = tmp[len - idx - 1];
-
- return len + width;
}

#define SIGN 1 /* unsigned/signed, must be 1 */
--
2.24.1

2020-04-20 21:02:01

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 14/15] print_integer, printf: rewrite the rest of lib/vsprintf.c via print_integer()

Lookup tables are cheating.

Signed-off-by: Alexey Dobriyan <[email protected]>
---
lib/vsprintf.c | 236 +++++--------------------------------------------
1 file changed, 20 insertions(+), 216 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index df2b5ce08fe9..b29fbd0da53f 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -138,204 +138,6 @@ int skip_atoi(const char **s)
return i;
}

-/*
- * Decimal conversion is by far the most typical, and is used for
- * /proc and /sys data. This directly impacts e.g. top performance
- * with many processes running. We optimize it for speed by emitting
- * two characters at a time, using a 200 byte lookup table. This
- * roughly halves the number of multiplications compared to computing
- * the digits one at a time. Implementation strongly inspired by the
- * previous version, which in turn used ideas described at
- * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
- * from the author, Douglas W. Jones).
- *
- * It turns out there is precisely one 26 bit fixed-point
- * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
- * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
- * range happens to be somewhat larger (x <= 1073741898), but that's
- * irrelevant for our purpose.
- *
- * For dividing a number in the range [10^4, 10^6-1] by 100, we still
- * need a 32x32->64 bit multiply, so we simply use the same constant.
- *
- * For dividing a number in the range [100, 10^4-1] by 100, there are
- * several options. The simplest is (x * 0x147b) >> 19, which is valid
- * for all x <= 43698.
- */
-
-static const u16 decpair[100] = {
-#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
- _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
- _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
- _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
- _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
- _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
- _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
- _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
- _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
- _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
- _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
-#undef _
-};
-
-/*
- * This will print a single '0' even if r == 0, since we would
- * immediately jump to out_r where two 0s would be written but only
- * one of them accounted for in buf. This is needed by ip4_string
- * below. All other callers pass a non-zero value of r.
-*/
-static noinline_for_stack
-char *put_dec_trunc8(char *buf, unsigned r)
-{
- unsigned q;
-
- /* 1 <= r < 10^8 */
- if (r < 100)
- goto out_r;
-
- /* 100 <= r < 10^8 */
- q = (r * (u64)0x28f5c29) >> 32;
- *((u16 *)buf) = decpair[r - 100*q];
- buf += 2;
-
- /* 1 <= q < 10^6 */
- if (q < 100)
- goto out_q;
-
- /* 100 <= q < 10^6 */
- r = (q * (u64)0x28f5c29) >> 32;
- *((u16 *)buf) = decpair[q - 100*r];
- buf += 2;
-
- /* 1 <= r < 10^4 */
- if (r < 100)
- goto out_r;
-
- /* 100 <= r < 10^4 */
- q = (r * 0x147b) >> 19;
- *((u16 *)buf) = decpair[r - 100*q];
- buf += 2;
-out_q:
- /* 1 <= q < 100 */
- r = q;
-out_r:
- /* 1 <= r < 100 */
- *((u16 *)buf) = decpair[r];
- buf += r < 10 ? 1 : 2;
- return buf;
-}
-
-#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
-static noinline_for_stack
-char *put_dec_full8(char *buf, unsigned r)
-{
- unsigned q;
-
- /* 0 <= r < 10^8 */
- q = (r * (u64)0x28f5c29) >> 32;
- *((u16 *)buf) = decpair[r - 100*q];
- buf += 2;
-
- /* 0 <= q < 10^6 */
- r = (q * (u64)0x28f5c29) >> 32;
- *((u16 *)buf) = decpair[q - 100*r];
- buf += 2;
-
- /* 0 <= r < 10^4 */
- q = (r * 0x147b) >> 19;
- *((u16 *)buf) = decpair[r - 100*q];
- buf += 2;
-
- /* 0 <= q < 100 */
- *((u16 *)buf) = decpair[q];
- buf += 2;
- return buf;
-}
-
-static noinline_for_stack
-char *put_dec(char *buf, unsigned long long n)
-{
- if (n >= 100*1000*1000)
- buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
- /* 1 <= n <= 1.6e11 */
- if (n >= 100*1000*1000)
- buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
- /* 1 <= n < 1e8 */
- return put_dec_trunc8(buf, n);
-}
-
-#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64
-
-static void
-put_dec_full4(char *buf, unsigned r)
-{
- unsigned q;
-
- /* 0 <= r < 10^4 */
- q = (r * 0x147b) >> 19;
- *((u16 *)buf) = decpair[r - 100*q];
- buf += 2;
- /* 0 <= q < 100 */
- *((u16 *)buf) = decpair[q];
-}
-
-/*
- * Call put_dec_full4 on x % 10000, return x / 10000.
- * The approximation x/10000 == (x * 0x346DC5D7) >> 43
- * holds for all x < 1,128,869,999. The largest value this
- * helper will ever be asked to convert is 1,125,520,955.
- * (second call in the put_dec code, assuming n is all-ones).
- */
-static noinline_for_stack
-unsigned put_dec_helper4(char *buf, unsigned x)
-{
- uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;
-
- put_dec_full4(buf, x - q * 10000);
- return q;
-}
-
-/* Based on code by Douglas W. Jones found at
- * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
- * (with permission from the author).
- * Performs no 64-bit division and hence should be fast on 32-bit machines.
- */
-static
-char *put_dec(char *buf, unsigned long long n)
-{
- uint32_t d3, d2, d1, q, h;
-
- if (n < 100*1000*1000)
- return put_dec_trunc8(buf, n);
-
- d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
- h = (n >> 32);
- d2 = (h ) & 0xffff;
- d3 = (h >> 16); /* implicit "& 0xffff" */
-
- /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
- = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
- q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
- q = put_dec_helper4(buf, q);
-
- q += 7671 * d3 + 9496 * d2 + 6 * d1;
- q = put_dec_helper4(buf+4, q);
-
- q += 4749 * d3 + 42 * d2;
- q = put_dec_helper4(buf+8, q);
-
- q += 281 * d3;
- buf += 12;
- if (q)
- buf = put_dec_trunc8(buf, q);
- else while (buf[-1] == '0')
- --buf;
-
- return buf;
-}
-
-#endif
-
/*
* Convert passed number to decimal string.
* Returns the length of string. On buffer overflow, returns 0.
@@ -410,8 +212,6 @@ static noinline_for_stack
char *number(char *buf, char *end, unsigned long long num,
struct printf_spec spec)
{
- /* put_dec requires 2-byte alignment of the buffer. */
- char tmp[3 * sizeof(num)] __aligned(2);
char sign;
char locase;
int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
@@ -419,6 +219,8 @@ char *number(char *buf, char *end, unsigned long long num,
bool is_zero = num == 0LL;
int field_width = spec.field_width;
int precision = spec.precision;
+ char tmp[22];
+ char *p = tmp + sizeof(tmp);

/* locase = 0 or 0x20. ORing digits or letters with 'locase'
* produces same digits or (maybe lowercased) letters */
@@ -446,10 +248,9 @@ char *number(char *buf, char *end, unsigned long long num,
field_width--;
}

- /* generate full string in tmp[], in reverse order */
- i = 0;
+ /* generate full string in tmp[] */
if (num < spec.base)
- tmp[i++] = hex_asc_upper[num] | locase;
+ *--p = hex_asc_upper[num] | locase;
else if (spec.base != 10) { /* 8 or 16 */
int mask = spec.base - 1;
int shift = 3;
@@ -457,12 +258,13 @@ char *number(char *buf, char *end, unsigned long long num,
if (spec.base == 16)
shift = 4;
do {
- tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase);
+ *--p = hex_asc_upper[((unsigned char)num) & mask] | locase;
num >>= shift;
} while (num);
} else { /* base 10 */
- i = put_dec(tmp, num) - tmp;
+ p = _print_integer_u64(p, num);
}
+ i = tmp + sizeof(tmp) - p;

/* printing 100 using %2d gives "100", not "00" */
if (i > precision)
@@ -512,10 +314,11 @@ char *number(char *buf, char *end, unsigned long long num,
++buf;
}
/* actual digits of result */
- while (--i >= 0) {
+ while (p != tmp + sizeof(tmp)) {
if (buf < end)
- *buf = tmp[i];
+ *buf = *p;
++buf;
+ p++;
}
/* trailing space padding */
while (--field_width >= 0) {
@@ -1297,17 +1100,18 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt)
break;
}
for (i = 0; i < 4; i++) {
- char temp[4] __aligned(2); /* hold each IP quad in reverse order */
- int digits = put_dec_trunc8(temp, addr[index]) - temp;
+ char tmp[3];
+ char *q = tmp + sizeof(tmp);
+
+ tmp[0] = tmp[1] = '0';
+ q = _print_integer_u32(q, addr[index]);
if (leading_zeros) {
- if (digits < 3)
- *p++ = '0';
- if (digits < 2)
- *p++ = '0';
+ q = tmp;
}
- /* reverse the digits in the quad */
- while (digits--)
- *p++ = temp[digits];
+ do {
+ *p++ = *q++;
+ } while (q != tmp + sizeof(tmp));
+
if (i < 3)
*p++ = '.';
index += step;
--
2.24.1

2020-04-20 21:02:04

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 03/15] print_integer: new and improved way of printing integers

Time honored way to print integers via vsnprintf() or equivalent has
unavoidable slowdown of parsing format string. This can't be fixed in C,
without introducing external preprocessor.

seq_put_decimal_ull() partially saves the day, but there are a lot of
branches inside and overcopying still.

_print_integer_*() family of functions is meant to make printing
integers as fast as possible by deleting format string parsing and doing
as little work as possible.

It is based on the following observations:

1) memcpy is done in forward direction
it can be done backwards but nobody does that,

2) digits can be extracted in a very simple loop which costs only
1 multiplication and shift (division by constant is not division)
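
(A sketch of what "division by constant is not division" means, for
illustration only and assuming a typical compiler: an unsigned 32-bit
division by 10 is strength-reduced to a multiply-high plus a shift,
roughly

	static unsigned int div10(unsigned int x)
	{
		/* 0xCCCCCCCD == ceil(2^35 / 10), exact for all 32-bit x */
		return ((unsigned long long)x * 0xCCCCCCCD) >> 35;
	}

so each iteration of the digit loop costs one multiplication and a
shift, not a hardware divide.)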

All the above asks for the following signature, semantics and pattern of
printing out beloved /proc files:

/* seq_printf(seq, "%u %llu\n", A, B); */

char buf[10 + 1 + 20 + 1];
char *p = buf + sizeof(buf);

*--p = '\n';
p = _print_integer_u64(p, B);
*--p = ' ';
p = _print_integer_u32(p, A);

seq_write(seq, p, buf + sizeof(buf) - p);

1) stack buffer capable of holding the biggest string is allocated.

2) "p" is pointer to start of the string. Initially it points past
the end of the buffer WHICH IS NOT NUL-TERMINATED!

3) _print_integer_*() actually prints an integer from right to left
and returns new start of the string.

<--------|
123
^
|
+-- p

4) 1 character is printed with

*--p = 'x';

It generates very efficient code as multiple writes can be
merged.

5) fixed string is printed with

p = memcpy(p - 3, "foo", 3);

Compilers know what memcpy() does and write-combine it.
4/8-byte writes become 1 instruction and are very efficient.

6) Once everything is printed, the result is written to seq_file buffer.
It does only one overflow check and 1 copy.

This generates very efficient code (and small!).

In regular seq_printf() calls, first argument and format string are
constantly reloaded. Format string will most likely be referenced with
[rip+...], which is quite verbose.

seq_put_decimal_ull() will do branches (and even more branches
with "width" argument)

TODO
benchmark with mainline because nouveau is broken for me -(
vsnprintf() changes make the code slower

Signed-off-by: Alexey Dobriyan <[email protected]>
---
MAINTAINERS | 6 ++++++
lib/Makefile | 2 +-
lib/print-integer.c | 40 ++++++++++++++++++++++++++++++++++++++++
lib/print-integer.h | 20 ++++++++++++++++++++
4 files changed, 67 insertions(+), 1 deletion(-)
create mode 100644 lib/print-integer.c
create mode 100644 lib/print-integer.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b816a453b10e..8322125bb929 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8470,6 +8470,12 @@ L: [email protected]
S: Maintained
F: drivers/crypto/inside-secure/

+INTEGER PRINTING PRESS
+M: Alexey Dobriyan <[email protected]>
+L: [email protected]
+F: lib/print-integer.[ch]
+S: Maintained
+
INTEGRITY MEASUREMENT ARCHITECTURE (IMA)
M: Mimi Zohar <[email protected]>
M: Dmitry Kasatkin <[email protected]>
diff --git a/lib/Makefile b/lib/Makefile
index 685aee60de1d..a2f011fa6739 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ KASAN_SANITIZE_string.o := n
CFLAGS_string.o := $(call cc-option, -fno-stack-protector)
endif

-lib-y := ctype.o string.o vsprintf.o cmdline.o \
+lib-y := ctype.o string.o print-integer.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o timerqueue.o xarray.o \
idr.o extable.o sha1.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
diff --git a/lib/print-integer.c b/lib/print-integer.c
new file mode 100644
index 000000000000..563aaca19b8c
--- /dev/null
+++ b/lib/print-integer.c
@@ -0,0 +1,40 @@
+#include <linux/compiler.h>
+#include <linux/math64.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "print-integer.h"
+
+noinline
+char *_print_integer_u32(char *p, u32 x)
+{
+ do {
+ *--p = '0' + (x % 10);
+ } while (x /= 10);
+ return p;
+}
+
+noinline
+char *_print_integer_s32(char *p, s32 x)
+{
+ if (x < 0) {
+ p = _print_integer_u32(p, -x);
+ *--p = '-';
+ return p;
+ } else {
+ return _print_integer_u32(p, x);
+ }
+}
+
+noinline
+char *_print_integer_u64(char *p, u64 x)
+{
+ while (x >= 100 * 1000 * 1000) {
+ u32 r;
+
+ x = div_u64_rem(x, 100 * 1000 * 1000, &r);
+ p = memset(p - 8, '0', 8);
+ (void)_print_integer_u32(p + 8, r);
+ }
+ return _print_integer_u32(p, x);
+}
diff --git a/lib/print-integer.h b/lib/print-integer.h
new file mode 100644
index 000000000000..a6f8e1757a6f
--- /dev/null
+++ b/lib/print-integer.h
@@ -0,0 +1,20 @@
+#pragma once
+char *_print_integer_u32(char *, u32);
+char *_print_integer_u64(char *, u64);
+char *_print_integer_s32(char *, s32);
+
+static inline char *_print_integer_ul(char *p, unsigned long x)
+{
+#ifdef CONFIG_64BIT
+ return _print_integer_u64(p, x);
+#else
+ return _print_integer_u32(p, x);
+#endif
+}
+
+enum {
+ LEN_U32 = 10,
+ LEN_S32 = 1 + LEN_U32,
+ LEN_UL = sizeof(long) * 5 / 2,
+ LEN_U64 = 20,
+};
--
2.24.1

2020-04-20 21:02:51

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 09/15] proc: s/p/tsk/

"p" will be used for pointeg to string start, use "tsk" as the best
identifier name.

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/fd.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 81882a13212d..e098302b5101 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -228,16 +228,16 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
static int proc_readfd_common(struct file *file, struct dir_context *ctx,
instantiate_t instantiate)
{
- struct task_struct *p = get_proc_task(file_inode(file));
+ struct task_struct *tsk = get_proc_task(file_inode(file));
struct files_struct *files;
unsigned int fd;

- if (!p)
+ if (!tsk)
return -ENOENT;

if (!dir_emit_dots(file, ctx))
goto out;
- files = get_files_struct(p);
+ files = get_files_struct(tsk);
if (!files)
goto out;

@@ -259,7 +259,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,

len = snprintf(name, sizeof(name), "%u", fd);
if (!proc_fill_cache(file, ctx,
- name, len, instantiate, p,
+ name, len, instantiate, tsk,
&data))
goto out_fd_loop;
cond_resched();
@@ -269,7 +269,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
out_fd_loop:
put_files_struct(files);
out:
- put_task_struct(p);
+ put_task_struct(tsk);
return 0;
}

--
2.24.1

2020-04-20 21:03:09

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 02/15] sched: make nr_iowait_cpu() return "unsigned int"

Same logic: 2^32 threads stuck waiting in runqueue implies
2^32+ processes total which is absurd.

Per-runqueue ->nr_iowait member being 32-bit hints that this is the
correct change!

Signed-off-by: Alexey Dobriyan <[email protected]>
---
drivers/cpuidle/governors/menu.c | 6 +++---
fs/proc/stat.c | 2 +-
include/linux/sched/stat.h | 4 ++--
kernel/sched/core.c | 6 +++---
4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b0a7ad566081..ddaaa36af290 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -117,7 +117,7 @@ struct menu_device {
int interval_ptr;
};

-static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters)
+static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
{
int bucket = 0;

@@ -150,7 +150,7 @@ static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters)
* to be, the higher this multiplier, and thus the higher
* the barrier to go to an expensive C state.
*/
-static inline int performance_multiplier(unsigned long nr_iowaiters)
+static inline int performance_multiplier(unsigned int nr_iowaiters)
{
/* for IO wait tasks (per cpu!) we add 10x each */
return 1 + 10 * nr_iowaiters;
@@ -270,7 +270,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
unsigned int predicted_us;
u64 predicted_ns;
u64 interactivity_req;
- unsigned long nr_iowaiters;
+ unsigned int nr_iowaiters;
ktime_t delta_next;
int i, idx;

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 93ce344f62a5..678feb7b9949 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -198,7 +198,7 @@ static int show_stat(struct seq_file *p, void *v)
"btime %llu\n"
"processes %lu\n"
"procs_running %u\n"
- "procs_blocked %lu\n",
+ "procs_blocked %u\n",
nr_context_switches(),
(unsigned long long)boottime.tv_sec,
total_forks,
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index f3b86515bafe..c4bd2fc95219 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -18,8 +18,8 @@ DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
unsigned int nr_running(void);
extern bool single_task_running(void);
-extern unsigned long nr_iowait(void);
-extern unsigned long nr_iowait_cpu(int cpu);
+unsigned int nr_iowait(void);
+unsigned int nr_iowait_cpu(int cpu);

static inline int sched_info_on(void)
{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d9bae602966c..ec98244e9d96 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3428,7 +3428,7 @@ unsigned long long nr_context_switches(void)
* it does become runnable.
*/

-unsigned long nr_iowait_cpu(int cpu)
+unsigned int nr_iowait_cpu(int cpu)
{
return atomic_read(&cpu_rq(cpu)->nr_iowait);
}
@@ -3463,9 +3463,9 @@ unsigned long nr_iowait_cpu(int cpu)
* Task CPU affinities can make all that even more 'interesting'.
*/

-unsigned long nr_iowait(void)
+unsigned int nr_iowait(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;

for_each_possible_cpu(i)
sum += nr_iowait_cpu(i);
--
2.24.1

2020-04-20 21:03:21

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 06/15] print_integer, proc: rewrite /proc/loadavg via print_integer()

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/loadavg.c | 26 ++++++++++++++++++++------
1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index f32878d9a39f..4540a894db22 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -9,19 +9,33 @@
#include <linux/seq_file.h>
#include <linux/seqlock.h>
#include <linux/time.h>
+#include "../../lib/print-integer.h"

static int loadavg_proc_show(struct seq_file *m, void *v)
{
unsigned long avnrun[3];
+ char buf[3 * (LEN_UL + 1 + 2 + 1) + 10 + 1 + 10 + 1 + 10 + 1];
+ char *p = buf + sizeof(buf);
+ int i;
+
+ *--p = '\n';
+ p = _print_integer_u32(p, idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
+ *--p = ' ';
+ p = _print_integer_u32(p, nr_threads);
+ *--p = '/';
+ p = _print_integer_u32(p, nr_running());

get_avenrun(avnrun, FIXED_1/200, 0);
+ for (i = 2; i >= 0; i--) {
+ *--p = ' ';
+ --p; /* overwritten */
+ *--p = '0'; /* conditionally overwritten */
+ (void)_print_integer_u32(p + 2, LOAD_FRAC(avnrun[i]));
+ *--p = '.';
+ p = _print_integer_ul(p, LOAD_INT(avnrun[i]));
+ }

- seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
- LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
- LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
- LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
- nr_running(), nr_threads,
- idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
+ seq_write(m, p, buf + sizeof(buf) - p);
return 0;
}

--
2.24.1

2020-04-20 21:05:38

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 08/15] print_integer, proc: rewrite /proc/uptime via print_integer()

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/uptime.c | 24 +++++++++++++++++++-----
1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 5a1b228964fb..a8190078d595 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -8,10 +8,14 @@
#include <linux/time_namespace.h>
#include <linux/kernel_stat.h>

+#include "../../lib/print-integer.h"
+
static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec64 uptime;
struct timespec64 idle;
+ char buf[(LEN_UL + 1 + 2 + 1) * 2];
+ char *p = buf + sizeof(buf);
u64 nsec;
u32 rem;
int i;
@@ -25,11 +29,21 @@ static int uptime_proc_show(struct seq_file *m, void *v)

idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
- seq_printf(m, "%lu.%02lu %lu.%02lu\n",
- (unsigned long) uptime.tv_sec,
- (uptime.tv_nsec / (NSEC_PER_SEC / 100)),
- (unsigned long) idle.tv_sec,
- (idle.tv_nsec / (NSEC_PER_SEC / 100)));
+
+ *--p = '\n';
+ --p; /* overwritten */
+ *--p = '0';
+ (void)_print_integer_u32(p + 2, idle.tv_nsec / (NSEC_PER_SEC / 100));
+ *--p = '.';
+ p = _print_integer_ul(p, (unsigned long)idle.tv_sec);
+ *--p = ' ';
+ --p; /* overwritten */
+ *--p = '0';
+ (void)_print_integer_u32(p + 2, uptime.tv_nsec / (NSEC_PER_SEC / 100));
+ *--p = '.';
+ p = _print_integer_ul(p, (unsigned long)uptime.tv_sec);
+
+ seq_write(m, p, buf + sizeof(buf) - p);
return 0;
}

--
2.24.1

2020-04-20 21:06:09

by Alexey Dobriyan

[permalink] [raw]
Subject: [PATCH 05/15] print_integer, proc: rewrite /proc/thread-self via print_integer()

Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/proc/thread_self.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index f61ae53533f5..c29d37e3bd28 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -15,14 +15,20 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
struct pid_namespace *ns = proc_pid_ns(inode);
pid_t tgid = task_tgid_nr_ns(current, ns);
pid_t pid = task_pid_nr_ns(current, ns);
+ char buf[10 + 6 + 10 + 1];
+ char *p = buf + sizeof(buf);
char *name;

if (!pid)
return ERR_PTR(-ENOENT);
- name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
+ name = kmalloc(sizeof(buf), dentry ? GFP_KERNEL : GFP_ATOMIC);
if (unlikely(!name))
return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
- sprintf(name, "%u/task/%u", tgid, pid);
+ *--p = '\0';
+ p = _print_integer_u32(p, pid);
+ p = memcpy(p - 6, "/task/", 6);
+ p = _print_integer_u32(p, tgid);
+ memcpy(name, p, buf + sizeof(buf) - p);
set_delayed_call(done, kfree_link, name);
return name;
}
--
2.24.1

2020-04-20 21:07:38

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 01/15] sched: make nr_running() return "unsigned int"

On Mon, Apr 20, 2020 at 11:57:29PM +0300, Alexey Dobriyan wrote:
> I don't think anyone has been crazy enough to spawn 2^32 threads.
> It'd require absurd amounts of physical memory, and bump into futex pid
> limit anyway.
>
> Meanwhile save few bits on REX prefixes and some stack space for upcoming
> print_integer() stuff.
>
> And remove "extern" from prototypes while I'm at it.

It seems like there's a few more places to fix in this regard?

kernel/sched/fair.c:static u64 __sched_period(unsigned long nr_running)
kernel/sched/sched.h: unsigned long dl_nr_running;
kernel/sched/core.c:unsigned long nr_iowait_cpu(int cpu)
kernel/sched/core.c:unsigned long nr_iowait(void)
kernel/sched/loadavg.c: long nr_active, delta = 0;
kernel/sched/sched.h: unsigned long rt_nr_migratory;
kernel/sched/sched.h: unsigned long rt_nr_total;
kernel/sched/sched.h: unsigned long rt_nr_boosted;
kernel/sched/sched.h: unsigned long dl_nr_running;
kernel/sched/sched.h: unsigned long dl_nr_migratory;
kernel/sched/sched.h: unsigned long nr_uninterruptible;

2020-04-20 21:20:50

by Andy Shevchenko

[permalink] [raw]
Subject: Re: [PATCH 03/15] print_integer: new and improved way of printing integers

On Mon, Apr 20, 2020 at 11:57:31PM +0300, Alexey Dobriyan wrote:
> Time honored way to print integers via vsnprintf() or equivalent has
> unavoidable slowdown of parsing format string. This can't be fixed in C,
> without introducing external preprocessor.
>
> seq_put_decimal_ull() partially saves the day, but there are a lot of
> branches inside and overcopying still.
>
> _print_integer_*() family of functions is meant to make printing
> integers as fast as possible by deleting format string parsing and doing
> as little work as possible.
>
> It is based on the following observations:
>
> 1) memcpy is done in forward direction
> it can be done backwards but nobody does that,
>
> 2) digits can be extracted in a very simple loop which costs only
> 1 multiplication and shift (division by constant is not division)
>
> All the above asks for the following signature, semantics and pattern of
> printing out beloved /proc files:
>
> /* seq_printf(seq, "%u %llu\n", A, b); */
>
> char buf[10 + 1 + 20 + 1];
> char *p = buf + sizeof(buf);
>
> *--p = '\n';
> p = _print_integer_u64(p, B);
> *--p = ' ';
> p = _print_integer_u32(p, A);
>
> seq_write(seq, p, buf + sizeof(buf) - p);
>
> 1) stack buffer capable of holding the biggest string is allocated.
>
> 2) "p" is pointer to start of the string. Initially it points past
> the end of the buffer WHICH IS NOT NUL-TERMINATED!
>
> 3) _print_integer_*() actually prints an integer from right to left
> and returns new start of the string.
>
> <--------|
> 123
> ^
> |
> +-- p
>
> 4) 1 character is printed with
>
> *--p = 'x';
>
> It generates very efficient code as multiple writes can be
> merged.
>
> 5) fixed string is printed with
>
> p = memcpy(p - 3, "foo", 3);
>
> Compilers know what memcpy() does and write-combine it.
> 4/8-byte writes become 1 instruction and are very efficient.
>
> 6) Once everything is printed, the result is written to seq_file buffer.
> It does only one overflow check and 1 copy.
>
> This generates very efficient code (and small!).
>
> In regular seq_printf() calls, first argument and format string are
> constantly reloaded. Format string will most likely be referenced with
> [rip+...], which is quite verbose.
>
> seq_put_decimal_ull() will do branches (and even more branches
> with "width" argument)
>

> TODO
> benchmark with mainline because nouveau is broken for me -(
> vsnprintf() changes make the code slower

Exactly main point of this exercise. I don't believe that algos in vsprintf.c
are too dumb to use division per digit (yes, division by constant which is not
power of two is a heavy operation).


> +noinline
> +char *_print_integer_u32(char *p, u32 x)
> +{
> + do {
> + *--p = '0' + (x % 10);
> + } while (x /= 10);
> + return p;
> +}

> +noinline
> +char *_print_integer_u64(char *p, u64 x)
> +{
> + while (x >= 100 * 1000 * 1000) {
> + u32 r;
> +
> + x = div_u64_rem(x, 100 * 1000 * 1000, &r);
> + p = memset(p - 8, '0', 8);
> + (void)_print_integer_u32(p + 8, r);
> + }
> + return _print_integer_u32(p, x);
> +}

--
With Best Regards,
Andy Shevchenko


2020-04-20 21:28:46

by Andy Shevchenko

[permalink] [raw]
Subject: Re: [PATCH 03/15] print_integer: new and improved way of printing integers

On Tue, Apr 21, 2020 at 12:19:11AM +0300, Andy Shevchenko wrote:
> On Mon, Apr 20, 2020 at 11:57:31PM +0300, Alexey Dobriyan wrote:
> > Time honored way to print integers via vsnprintf() or equivalent has
> > unavoidable slowdown of parsing format string. This can't be fixed in C,
> > without introducing external preprocessor.
> >
> > seq_put_decimal_ull() partially saves the day, but there are a lot of
> > branches inside and overcopying still.
> >
> > _print_integer_*() family of functions is meant to make printing
> > integers as fast as possible by deleting format string parsing and doing
> > as little work as possible.
> >
> > It is based on the following observations:
> >
> > 1) memcpy is done in forward direction
> > it can be done backwards but nobody does that,
> >
> > 2) digits can be extracted in a very simple loop which costs only
> > 1 multiplication and shift (division by constant is not division)
> >
> > All the above asks for the following signature, semantics and pattern of
> > printing out beloved /proc files:
> >
> > /* seq_printf(seq, "%u %llu\n", A, b); */
> >
> > char buf[10 + 1 + 20 + 1];
> > char *p = buf + sizeof(buf);
> >
> > *--p = '\n';
> > p = _print_integer_u64(p, B);
> > *--p = ' ';
> > p = _print_integer_u32(p, A);
> >
> > seq_write(seq, p, buf + sizeof(buf) - p);
> >
> > 1) stack buffer capable of holding the biggest string is allocated.
> >
> > 2) "p" is pointer to start of the string. Initially it points past
> > the end of the buffer WHICH IS NOT NUL-TERMINATED!
> >
> > 3) _print_integer_*() actually prints an integer from right to left
> > and returns new start of the string.
> >
> > <--------|
> > 123
> > ^
> > |
> > +-- p
> >
> > 4) 1 character is printed with
> >
> > *--p = 'x';
> >
> > It generates very efficient code as multiple writes can be
> > merged.
> >
> > 5) fixed string is printed with
> >
> > p = memcpy(p - 3, "foo", 3);
> >
> > Compilers know what memcpy() does and write-combine it.
> > 4/8-byte writes become 1 instruction and are very efficient.
> >
> > 6) Once everything is printed, the result is written to seq_file buffer.
> > It does only one overflow check and 1 copy.
> >
> > This generates very efficient code (and small!).
> >
> > In regular seq_printf() calls, first argument and format string are
> > constantly reloaded. Format string will most likely be referenced with
> > [rip+...], which is quite verbose.
> >
> > seq_put_decimal_ull() will do branches (and even more branches
> > with "width" argument)
> >
>
> > TODO
> > benchmark with mainline because nouveau is broken for me -(
> > vsnprintf() changes make the code slower
>
> Exactly main point of this exercise. I don't believe that algos in vsprintf.c
> are too dumb to use division per digit (yes, division by constant which is not
> power of two is a heavy operation).
>

And second point here, why not to use existing algos from vsprintf.c?

--
With Best Regards,
Andy Shevchenko


2020-04-21 01:57:41

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 03/15] print_integer: new and improved way of printing integers

On Tue, 21 Apr 2020 00:27:23 +0300
Andy Shevchenko <[email protected]> wrote:

> >
> > > TODO
> > > benchmark with mainline because nouveau is broken for me -(
> > > vsnprintf() changes make the code slower
> >
> > Exactly main point of this exercise. I don't believe that algos in vsprintf.c
> > are too dumb to use division per digit (yes, division by constant which is not
> > power of two is a heavy operation).
> >
>
> And second point here, why not to use existing algos from vsprintf.c?

Exactly. The code in _print_integer_u32() doesn't look as fast as the
code in vsprintf() that happens to use lookup tables and converts
without any loops.

Hint, loops are bad, they cause the CPU to slow down.

Anyway, this patch series would require a pretty good improvement, as
the code replacing the sprintf() usages is pretty ugly compared to a
simple sprintf() call.

Randomly picking patch 6:

static int loadavg_proc_show(struct seq_file *m, void *v)
{
unsigned long avnrun[3];

get_avenrun(avnrun, FIXED_1/200, 0);

seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
nr_running(), nr_threads,
idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
return 0;
}

*vs*

static int loadavg_proc_show(struct seq_file *m, void *v)
{
unsigned long avnrun[3];
char buf[3 * (LEN_UL + 1 + 2 + 1) + 10 + 1 + 10 + 1 + 10 + 1];
char *p = buf + sizeof(buf);
int i;

*--p = '\n';
p = _print_integer_u32(p, idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
*--p = ' ';
p = _print_integer_u32(p, nr_threads);
*--p = '/';
p = _print_integer_u32(p, nr_running());

get_avenrun(avnrun, FIXED_1/200, 0);
for (i = 2; i >= 0; i--) {
*--p = ' ';
--p; /* overwritten */
*--p = '0'; /* conditionally overwritten */
(void)_print_integer_u32(p + 2, LOAD_FRAC(avnrun[i]));
*--p = '.';
p = _print_integer_ul(p, LOAD_INT(avnrun[i]));
}

seq_write(m, p, buf + sizeof(buf) - p);
return 0;
}


I much rather keep the first version.

-- Steve

2020-04-21 02:05:01

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 14/15] print_integer, printf: rewrite the rest of lib/vsprintf.c via print_integer()

On Mon, 20 Apr 2020 23:57:42 +0300
Alexey Dobriyan <[email protected]> wrote:

> Lookup tables are cheating.

But damn fast!


>
> Signed-off-by: Alexey Dobriyan <[email protected]>
> ---
>

NACK!

-- Steve

2020-04-21 16:33:40

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [PATCH 03/15] print_integer: new and improved way of printing integers

On Tue, Apr 21, 2020 at 12:19:11AM +0300, Andy Shevchenko wrote:
> On Mon, Apr 20, 2020 at 11:57:31PM +0300, Alexey Dobriyan wrote:
> > Time honored way to print integers via vsnprintf() or equivalent has
> > unavoidable slowdown of parsing format string. This can't be fixed in C,
> > without introducing external preprocessor.
> >
> > seq_put_decimal_ull() partially saves the day, but there are a lot of
> > branches inside and overcopying still.
> >
> > _print_integer_*() family of functions is meant to make printing
> > integers as fast as possible by deleting format string parsing and doing
> > as little work as possible.
> >
> > It is based on the following observations:
> >
> > 1) memcpy is done in forward direction
> > it can be done backwards but nobody does that,
> >
> > 2) digits can be extracted in a very simple loop which costs only
> > 1 multiplication and shift (division by constant is not division)
> >
> > All the above asks for the following signature, semantics and pattern of
> > printing out beloved /proc files:
> >
> > /* seq_printf(seq, "%u %llu\n", A, b); */
> >
> > char buf[10 + 1 + 20 + 1];
> > char *p = buf + sizeof(buf);
> >
> > *--p = '\n';
> > p = _print_integer_u64(p, B);
> > *--p = ' ';
> > p = _print_integer_u32(p, A);
> >
> > seq_write(seq, p, buf + sizeof(buf) - p);
> >
> > 1) stack buffer capable of holding the biggest string is allocated.
> >
> > 2) "p" is pointer to start of the string. Initially it points past
> > the end of the buffer WHICH IS NOT NUL-TERMINATED!
> >
> > 3) _print_integer_*() actually prints an integer from right to left
> > and returns new start of the string.
> >
> > <--------|
> > 123
> > ^
> > |
> > +-- p
> >
> > 4) 1 character is printed with
> >
> > *--p = 'x';
> >
> > It generates very efficient code as multiple writes can be
> > merged.
> >
> > 5) fixed string is printed with
> >
> > p = memcpy(p - 3, "foo", 3);
> >
> > Compilers know what memcpy() does and write-combine it.
> > 4/8-byte writes become 1 instruction and are very efficient.
> >
> > 6) Once everything is printed, the result is written to seq_file buffer.
> > It does only one overflow check and 1 copy.
> >
> > This generates very efficient code (and small!).
> >
> > In regular seq_printf() calls, first argument and format string are
> > constantly reloaded. Format string will most likely with [rip+...] which
> > is quite verbose.
> >
> > seq_put_decimal_ull() will do branches (and even more branches
> > with "width" argument)
> >
>
> > TODO
> > benchmark with mainline because nouveau is broken for me -(
> > vsnprintf() changes make the code slower
>
> Exactly main point of this exercise. I don't believe that algos in vsprintf.c
> are too dumb to use division per digit (yes, division by constant which is not
> power of two is a heavy operation).

It is not about division.

It is about fucktons of branches in vsprintf().

> > +noinline
> > +char *_print_integer_u32(char *p, u32 x)
> > +{
> > + do {
> > + *--p = '0' + (x % 10);
> > + } while (x /= 10);
> > + return p;
> > +}
>
> > +noinline
> > +char *_print_integer_u64(char *p, u64 x)
> > +{
> > + while (x >= 100 * 1000 * 1000) {
> > + u32 r;
> > +
> > + x = div_u64_rem(x, 100 * 1000 * 1000, &r);
> > + p = memset(p - 8, '0', 8);
> > + (void)_print_integer_u32(p + 8, r);
> > + }
> > + return _print_integer_u32(p, x);
> > +}
>
> --
> With Best Regards,
> Andy Shevchenko
>
>

2020-04-21 16:51:39

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [PATCH 03/15] print_integer: new and improved way of printing integers

On Mon, Apr 20, 2020 at 09:54:17PM -0400, Steven Rostedt wrote:
> On Tue, 21 Apr 2020 00:27:23 +0300
> Andy Shevchenko <[email protected]> wrote:
>
> > >
> > > > TODO
> > > > benchmark with mainline because nouveau is broken for me -(
> > > > vsnprintf() changes make the code slower
> > >
> > > Exactly main point of this exercise. I don't believe that algos in vsprintf.c
> > > are too dumb to use division per digit (yes, division by constant which is not
> > > power of two is a heavy operation).
> > >
> >
> > And a second point here: why not use the existing algos from vsprintf.c?
>
> Exactly. The code in _print_integer_u32() doesn't look as fast as the
> code in vsprintf() that happens to use lookup tables and converts
> without any loops.
>
> Hint, loops are bad, they cause the CPU to slow down.

Oh, come on! Loops make code fit into icache and μop decode cache.

> Anyway, this patch series would require a pretty good improvement, as
> the code replacing the sprintf() usages is pretty ugly compared to a
> simple sprintf() call.

No! Fast code must look ugly. Or in other words if you try to optimise
integer printing to death you'll probably end with something like
_print_integer().

When the very first patch changed /proc/stat to seq_put_decimal_ull()
the speed up was 66% (or 33%). That's how slow printing was back then.
It can be made slightly faster even now.

> Randomly picking patch 6:
>
> static int loadavg_proc_show(struct seq_file *m, void *v)
> {
> unsigned long avnrun[3];
>
> get_avenrun(avnrun, FIXED_1/200, 0);
>
> seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
> LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
> LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
> LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
> nr_running(), nr_threads,
> idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
> return 0;
> }
>
> *vs*
>
> static int loadavg_proc_show(struct seq_file *m, void *v)
> {
> unsigned long avnrun[3];
> char buf[3 * (LEN_UL + 1 + 2 + 1) + 10 + 1 + 10 + 1 + 10 + 1];
> char *p = buf + sizeof(buf);
> int i;
>
> *--p = '\n';
> p = _print_integer_u32(p, idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
> *--p = ' ';
> p = _print_integer_u32(p, nr_threads);
> *--p = '/';
> p = _print_integer_u32(p, nr_running());
>
> get_avenrun(avnrun, FIXED_1/200, 0);
> for (i = 2; i >= 0; i--) {
> *--p = ' ';
> --p; /* overwritten */
> *--p = '0'; /* conditionally overwritten */
> (void)_print_integer_u32(p + 2, LOAD_FRAC(avnrun[i]));
> *--p = '.';
> p = _print_integer_ul(p, LOAD_INT(avnrun[i]));
> }
>
> seq_write(m, p, buf + sizeof(buf) - p);
> return 0;
> }
>
>
> > I'd much rather keep the first version.

I did the benchmarks (without stack protector, though); everything except
/proc/cpuinfo and /proc/meminfo became faster. This requires investigation,
and I can drop the vsprintf() changes until then.
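
(The thread does not carry the benchmark itself, so as a hedged illustration only: a measurement of this kind can be as simple as timing repeated reads of one /proc file from userspace. The file name, iteration count and timing method below are assumptions made for the sketch, not the actual harness used.)

/* Hypothetical userspace micro-benchmark: time N reads of one /proc file. */
#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct timespec t0, t1;
	long i, n = 100000;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < n; i++) {
		int fd = open("/proc/loadavg", O_RDONLY);

		if (fd < 0 || read(fd, buf, sizeof(buf)) < 0)
			return 1;
		close(fd);
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%.1f ns per open+read+close\n",
	       ((t1.tv_sec - t0.tv_sec) * 1e9 +
		(t1.tv_nsec - t0.tv_nsec)) / (double)n);
	return 0;
}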

Now, given that the /proc/uptime format is cast in stone, the code may look
a bit ugly and unusual, but it won't require maintenance.

2020-04-21 17:01:14

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 03/15] print_integer: new and improved way of printing integers

On Mon, Apr 20, 2020 at 11:57:31PM +0300, Alexey Dobriyan wrote:
> 1) memcpy is done in forward direction
> it can be done backwards but nobody does that,

If you're determined to do this, then use memmove() which actually
guarantees to work with overlapping ranges. Don't rely on non-guaranteed
behaviour of current implementations of memcpy(). Did you really check
the two dozen assembly implementations of memcpy() in the kernel?
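
(As a hedged aside, the overlap point in plain C terms: the snippet below is a generic illustration, not kernel code. memcpy() has undefined behaviour when source and destination overlap; memmove() is specified to handle it.)

#include <string.h>

/*
 * Shift a string two bytes to the left in place (assumes len >= 2).
 * Source and destination overlap, so only memmove() is guaranteed to work;
 * the commented-out memcpy() would be undefined behaviour.
 */
void drop_two_leading_chars(char *buf, size_t len)
{
	/* memcpy(buf, buf + 2, len - 2); */
	memmove(buf, buf + 2, len - 2);
}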

> 2) digits can be extracted in a very simple loop which costs only
> 1 multiplication and shift (division by constant is not division)

> +noinline
> +char *_print_integer_u32(char *p, u32 x)
> +{
> + do {
> + *--p = '0' + (x % 10);
> + } while (x /= 10);
> + return p;
> +}

Why not do two digits at a time like put_dec() does?
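
(A hedged sketch of what that could look like: _print_integer_u32() reworked to emit two digits per division using a pair table, roughly in the spirit of put_dec() in lib/vsprintf.c. This is illustration only; it is neither the kernel's code nor part of the posted series, and the names are made up.)

/* "00".."99" as one flat table; one NUL terminator at the end. */
static const char decpair[201] =
	"00010203040506070809"
	"10111213141516171819"
	"20212223242526272829"
	"30313233343536373839"
	"40414243444546474849"
	"50515253545556575859"
	"60616263646566676869"
	"70717273747576777879"
	"80818283848586878889"
	"90919293949596979899";

/* Right-to-left like _print_integer_u32(), but two digits per division. */
static char *print_integer_u32_pairs(char *p, unsigned int x)
{
	while (x >= 100) {
		unsigned int r = x % 100;	/* becomes mul+shift */

		x /= 100;
		*--p = decpair[2 * r + 1];	/* ones digit */
		*--p = decpair[2 * r];		/* tens digit */
	}
	*--p = decpair[2 * x + 1];		/* last (or only) digit */
	if (x >= 10)
		*--p = decpair[2 * x];
	return p;
}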

2020-04-21 17:10:06

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [PATCH 01/15] sched: make nr_running() return "unsigned int"

On Mon, Apr 20, 2020 at 02:05:57PM -0700, Matthew Wilcox wrote:
> On Mon, Apr 20, 2020 at 11:57:29PM +0300, Alexey Dobriyan wrote:
> > I don't anyone have been crazy enough to spawn 2^32 threads.
> > It'd require absurd amounts of physical memory, and bump into futex pid
> > limit anyway.
> >
> > Meanwhile save few bits on REX prefixes and some stack space for upcoming
> > print_integer() stuff.
> >
> > And remove "extern" from prototypes while I'm at it.
>
> It seems like there's a few more places to fix in this regard?
>
> kernel/sched/fair.c:static u64 __sched_period(unsigned long nr_running)
> kernel/sched/sched.h: unsigned long dl_nr_running;
> kernel/sched/core.c:unsigned long nr_iowait_cpu(int cpu)
> kernel/sched/core.c:unsigned long nr_iowait(void)
> kernel/sched/loadavg.c: long nr_active, delta = 0;
> kernel/sched/sched.h: unsigned long rt_nr_migratory;
> kernel/sched/sched.h: unsigned long rt_nr_total;
> kernel/sched/sched.h: unsigned long rt_nr_boosted;
> kernel/sched/sched.h: unsigned long dl_nr_running;
> kernel/sched/sched.h: unsigned long dl_nr_migratory;
> kernel/sched/sched.h: unsigned long nr_uninterruptible;

Sure. I changed nr_running() and nr_iowait() because they're in format
strings in /proc as %lu.

2020-04-21 21:10:17

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 03/15] print_integer: new and improved way of printing integers

On Tue, 21 Apr 2020 19:49:24 +0300
Alexey Dobriyan <[email protected]> wrote:
> > Exactly. The code in _print_integer_u32() doesn't look as fast as the
> > code in vsprintf() that happens to use lookup tables and converts
> > without any loops.
> >
> > Hint, loops are bad, they cause the CPU to slow down.
>
> Oh, come on! Loops make code fit into icache and μop decode cache.

Depends on the architecture.

>
> > Anyway, this patch series would require a pretty good improvement, as
> > the code replacing the sprintf() usages is pretty ugly compared to a
> > simple sprintf() call.
>
> No! Fast code must look ugly. Or in other words if you try to optimise
> integer printing to death you'll probably end with something like
> _print_integer().

As I stated, it will require a "pretty good improvement". There's always a
trade-off. If the improvement is noticeable in real-life cases, then ugly
code is worth it. If we are making code ugly for the benefit of something
that never shows outside of the noise, then the net cost (less maintainable
code) is not worth it.

>
> When the very first patch changed /proc/stat to seq_put_decimal_ull()
> the speed up was 66% (or 33%). That's how slow printing was back then.
> It can be made slightly faster even now.

I'd like to see the tests that you ran (to reproduce them myself).

The first patch turned a 64-bit number into a 32-bit number, thus
shortening the work by half.

>
> > Randomly picking patch 6:
> >
> > static int loadavg_proc_show(struct seq_file *m, void *v)
> > {
> > unsigned long avnrun[3];
> >
> > get_avenrun(avnrun, FIXED_1/200, 0);
> >
> > seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
> > LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
> > LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
> > LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
> > nr_running(), nr_threads,
> > idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
> > return 0;
> > }
> >
> > *vs*
> >
> > static int loadavg_proc_show(struct seq_file *m, void *v)
> > {
> > unsigned long avnrun[3];
> > char buf[3 * (LEN_UL + 1 + 2 + 1) + 10 + 1 + 10 + 1 + 10 + 1];
> > char *p = buf + sizeof(buf);
> > int i;
> >
> > *--p = '\n';
> > p = _print_integer_u32(p, idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
> > *--p = ' ';
> > p = _print_integer_u32(p, nr_threads);
> > *--p = '/';
> > p = _print_integer_u32(p, nr_running());
> >
> > get_avenrun(avnrun, FIXED_1/200, 0);
> > for (i = 2; i >= 0; i--) {
> > *--p = ' ';
> > --p; /* overwritten */
> > *--p = '0'; /* conditionally overwritten */
> > (void)_print_integer_u32(p + 2, LOAD_FRAC(avnrun[i]));
> > *--p = '.';
> > p = _print_integer_ul(p, LOAD_INT(avnrun[i]));
> > }
> >
> > seq_write(m, p, buf + sizeof(buf) - p);
> > return 0;
> > }
> >
> >
> > I'd much rather keep the first version.
>
> I did the benchmarks (without stack protector, though); everything except
> /proc/cpuinfo and /proc/meminfo became faster. This requires investigation,
> and I can drop the vsprintf() changes until then.
>
> Now, given that the /proc/uptime format is cast in stone, the code may look
> a bit ugly and unusual, but it won't require maintenance.

Please share what you did for your benchmarks. If the performance is as good
as you claim, then these changes would be worth looking into.


So I applied your entire series and added the following patch:

diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 19a9be9d97ee..17c582d77ab7 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -152,9 +152,16 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
preempt_enable();
}

+extern u64 sched_clock(void);
static inline void __raw_spin_unlock_irqrestore(raw_spinlock_t *lock,
unsigned long flags)
{
+ char buf[32];
+ u64 start, stop;
+ start = sched_clock();
+ sprintf(buf,"%lld", (unsigned long)lock);
+ stop = sched_clock();
+ trace_printk("time: %lld '%s'\n", stop - start, buf);
spin_release(&lock->dep_map, _RET_IP_);
do_raw_spin_unlock(lock);
local_irq_restore(flags);


Then after boot up, I did the following:

# trace-cmd stop
# trace-cmd extract

Which captured the traces:

<idle>-0 [003] 5.405208: bprint: _raw_spin_unlock_irqrestore: time: 799 '-110308271193088'
<idle>-0 [003] 5.405210: bprint: _raw_spin_unlock_irqrestore: time: 273 '-110308271193088'
<idle>-0 [003] 5.412235: bprint: _raw_spin_unlock_irqrestore: time: 1138 '-110308271193088'
<idle>-0 [003] 5.412236: bprint: _raw_spin_unlock_irqrestore: time: 213 '-110308271193088'
<idle>-0 [003] 5.414241: bprint: _raw_spin_unlock_irqrestore: time: 1094 '-110308271193088'
<idle>-0 [003] 5.414243: bprint: _raw_spin_unlock_irqrestore: time: 182 '-110308271193088'
<idle>-0 [003] 5.418241: bprint: _raw_spin_unlock_irqrestore: time: 1113 '-110308271193088'


Where "time: X", X is the delta in nanoseconds around the sprintf() call.

Then I ran the attached perl program on the output and got the following
results:

Before your patches:

# trace-cmd report | ./report.pl
full_total = 52844823
average: 255.902176229032
std: 439.269729814847

And with your patches:

# trace-cmd report | ./report.pl
full_total = 84725476
average: 407.873274762306
std: 555.755670463724

As the standard deviation is bigger than the average, it appears to be all
in the noise.
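
(report.pl itself is not reproduced here; as a hedged sketch only, the aggregation amounts to a mean and standard deviation over the per-call deltas, roughly as below. Whether the attached script uses the population or sample formula is not known, so this is an assumption.)

#include <math.h>
#include <stdio.h>

/* Illustrative aggregation over the "time: X" deltas (population std). */
void report(const double *delta, long n)
{
	double sum = 0.0, sumsq = 0.0;
	long i;

	for (i = 0; i < n; i++) {
		sum += delta[i];
		sumsq += delta[i] * delta[i];
	}
	printf("full_total = %.0f\n", sum);
	printf("average: %.12g\n", sum / n);
	printf("std: %.12g\n", sqrt(sumsq / n - (sum / n) * (sum / n)));
}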

Then I decided to see if it affects "ps -eux"

Original:

# perf stat -r 100 ps -eux > /dev/null

Performance counter stats for 'ps -eux' (100 runs):

8.92 msec task-clock # 0.937 CPUs utilized ( +- 0.90% )
5 context-switches # 0.545 K/sec ( +- 1.24% )
0 cpu-migrations # 0.000 K/sec
259 page-faults # 0.029 M/sec ( +- 0.07% )
32,973,751 cycles # 3.698 GHz ( +- 0.09% )
17,254,307 stalled-cycles-frontend # 52.33% frontend cycles idle ( +- 0.17% )
38,707,960 instructions # 1.17 insn per cycle
# 0.45 stalled cycles per insn ( +- 0.01% )
8,153,197 branches # 914.274 M/sec ( +- 0.01% )
114,992 branch-misses # 1.41% of all branches ( +- 0.12% )

0.0095170 +- 0.0000829 seconds time elapsed ( +- 0.87% )

With your patches:

# perf stat -r 100 ps -eux > /dev/null

Performance counter stats for 'ps -eux' (100 runs):

8.86 msec task-clock # 0.918 CPUs utilized ( +- 1.06% )
5 context-switches # 0.527 K/sec ( +- 1.22% )
0 cpu-migrations # 0.001 K/sec ( +-100.00% )
259 page-faults # 0.029 M/sec ( +- 0.08% )
32,699,168 cycles # 3.692 GHz ( +- 0.12% )
16,995,861 stalled-cycles-frontend # 51.98% frontend cycles idle ( +- 0.21% )
38,114,396 instructions # 1.17 insn per cycle
# 0.45 stalled cycles per insn ( +- 0.03% )
7,985,526 branches # 901.625 M/sec ( +- 0.03% )
112,852 branch-misses # 1.41% of all branches ( +- 0.17% )

0.009652 +- 0.000276 seconds time elapsed ( +- 2.86% )

Not really much difference.

Then what about just catting /proc/$$/stat, and doing 1000 runs!

Original:

# perf stat -r 1000 cat /proc/$$/stat > /dev/null

Performance counter stats for 'cat /proc/1622/stat' (1000 runs):

0.34 msec task-clock # 0.680 CPUs utilized ( +- 0.21% )
0 context-switches # 0.071 K/sec ( +- 20.18% )
0 cpu-migrations # 0.000 K/sec
65 page-faults # 0.192 M/sec ( +- 0.07% )
993,486 cycles # 2.934 GHz ( +- 0.07% )
577,903 stalled-cycles-frontend # 58.17% frontend cycles idle ( +- 0.09% )
936,489 instructions # 0.94 insn per cycle
# 0.62 stalled cycles per insn ( +- 0.07% )
202,912 branches # 599.213 M/sec ( +- 0.07% )
6,976 branch-misses # 3.44% of all branches ( +- 0.11% )

0.00049797 +- 0.00000111 seconds time elapsed ( +- 0.22% )


With your patches:

# perf stat -r 1000 cat /proc/$$/stat > /dev/null

Performance counter stats for 'cat /proc/1624/stat' (1000 runs):

0.34 msec task-clock # 0.664 CPUs utilized ( +- 0.23% )
0 context-switches # 0.018 K/sec ( +- 40.72% )
0 cpu-migrations # 0.000 K/sec
65 page-faults # 0.190 M/sec ( +- 0.07% )
988,430 cycles # 2.892 GHz ( +- 0.07% )
574,841 stalled-cycles-frontend # 58.16% frontend cycles idle ( +- 0.09% )
933,852 instructions # 0.94 insn per cycle
# 0.62 stalled cycles per insn ( +- 0.07% )
202,096 branches # 591.297 M/sec ( +- 0.07% )
6,836 branch-misses # 3.38% of all branches ( +- 0.09% )

0.00051476 +- 0.00000557 seconds time elapsed ( +- 1.08% )


They are pretty much identical.

What's the purpose of all these changes again? There was no cover letter.

-- Steve


Attachments:
report.pl (474.00 B)

2020-04-23 03:48:32

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCH 03/15] print_integer: new and improved way of printing integers

On (20/04/21 19:49), Alexey Dobriyan wrote:
>
> I did the benchmarks (without stack protector, though); everything except
> /proc/cpuinfo and /proc/meminfo became faster.
>

Why does the performance of /proc/cpuinfo and /proc/meminfo matter?

-ss