LinuxLists.cc - [PATCH] fscache: fix a kernel BUG at fs/fscache/operation.c:69!

2018-02-22 07:34:34

Subject: [PATCH] fscache: fix a kernel BUG at fs/fscache/operation.c:69!

From: Lei Xue <[email protected]>

There is a potential race in fscache operation enqueuing for reading and
copying multiple pages from cachefiles to netfs.
Under some heavy load system, it will happen very often.

If this race occurs, an oops similar to the following is seen:

kernel BUG at fs/fscache/operation.c:69!
invalid opcode: 0000 [#1] SMP
…
#0 [ffff883fff0838d8] machine_kexec at ffffffff81051beb
#1 [ffff883fff083938] crash_kexec at ffffffff810f2542
#2 [ffff883fff083a08] oops_end at ffffffff8163e1a8
#3 [ffff883fff083a30] die at ffffffff8101859b
#4 [ffff883fff083a60] do_trap at ffffffff8163d860
#5 [ffff883fff083ab0] do_invalid_op at ffffffff81015204
#6 [ffff883fff083b60] invalid_op at ffffffff8164701e
[exception RIP: fscache_enqueue_operation+246]
RIP: ffffffffa0b793c6 RSP: ffff883fff083c18 RFLAGS: 00010046
RAX: 0000000000000019 RBX: ffff8832ed1a9ec0 RCX: 0000000000000006
RDX: 0000000000000000 RSI: 0000000000000046 RDI: 0000000000000046
RBP: ffff883fff083c20 R8: 0000000000000086 R9: 000000000000178f
R10: ffffffff816aeb00 R11: ffff883fff08392e R12: ffff8802f0525620
R13: ffff88407ffc01d8 R14: 0000000000000000 R15: 0000000000000003
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000
#7 [ffff883fff083c10] fscache_enqueue_operation at ffffffffa0b793c6
#8 [ffff883fff083c28] cachefiles_read_waiter at ffffffffa0b15a48
#9 [ffff883fff083c48] __wake_up_common at ffffffff810af028

Signed-off-by: Lei Xue <[email protected]>
---
fs/cachefiles/rdwr.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 883bc7bb12c5..9d5d13e150fb 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -58,9 +58,9 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,

spin_lock(&object->work_lock);
list_add_tail(&monitor->op_link, &monitor->op->to_do);
+ fscache_enqueue_retrieval(monitor->op);
spin_unlock(&object->work_lock);

- fscache_enqueue_retrieval(monitor->op);
return 0;
}

--
2.14.3 (Apple Git-98)

2018-05-08 13:45:20

by Vegard Nossum

[permalink] [raw]

Subject: Re: [PATCH] fscache: fix a kernel BUG at fs/fscache/operation.c:69!

On 22 February 2018 at 08:33, <[email protected]> wrote:
> From: Lei Xue <[email protected]>
>
> There is a potential race in fscache operation enqueuing for reading and
> copying multiple pages from cachefiles to netfs.
> Under some heavy load system, it will happen very often.
>
> If this race occurs, an oops similar to the following is seen:
>
> kernel BUG at fs/fscache/operation.c:69!
> invalid opcode: 0000 [#1] SMP
> …
> #0 [ffff883fff0838d8] machine_kexec at ffffffff81051beb
> #1 [ffff883fff083938] crash_kexec at ffffffff810f2542
> #2 [ffff883fff083a08] oops_end at ffffffff8163e1a8
> #3 [ffff883fff083a30] die at ffffffff8101859b
> #4 [ffff883fff083a60] do_trap at ffffffff8163d860
> #5 [ffff883fff083ab0] do_invalid_op at ffffffff81015204
> #6 [ffff883fff083b60] invalid_op at ffffffff8164701e
> [exception RIP: fscache_enqueue_operation+246]
> RIP: ffffffffa0b793c6 RSP: ffff883fff083c18 RFLAGS: 00010046
> RAX: 0000000000000019 RBX: ffff8832ed1a9ec0 RCX: 0000000000000006
> RDX: 0000000000000000 RSI: 0000000000000046 RDI: 0000000000000046
> RBP: ffff883fff083c20 R8: 0000000000000086 R9: 000000000000178f
> R10: ffffffff816aeb00 R11: ffff883fff08392e R12: ffff8802f0525620
> R13: ffff88407ffc01d8 R14: 0000000000000000 R15: 0000000000000003
> ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000
> #7 [ffff883fff083c10] fscache_enqueue_operation at ffffffffa0b793c6
> #8 [ffff883fff083c28] cachefiles_read_waiter at ffffffffa0b15a48
> #9 [ffff883fff083c48] __wake_up_common at ffffffff810af028
>
> Signed-off-by: Lei Xue <[email protected]>
> ---
> fs/cachefiles/rdwr.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
> index 883bc7bb12c5..9d5d13e150fb 100644
> --- a/fs/cachefiles/rdwr.c
> +++ b/fs/cachefiles/rdwr.c
> @@ -58,9 +58,9 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
>
> spin_lock(&object->work_lock);
> list_add_tail(&monitor->op_link, &monitor->op->to_do);
> + fscache_enqueue_retrieval(monitor->op);
> spin_unlock(&object->work_lock);
>
> - fscache_enqueue_retrieval(monitor->op);
> return 0;
> }

Hi,

Just wondering what the status of this patch is?

We've been hitting a similar problem and arrived at the same patch as
a potential fix for it.

Our crashes look like this:

WARNING: CPU: 0 PID: 120693 at kernel/workqueue.c:618 insert_work+0x5f/0x70
Modules linked in: nbd
CPU: 0 PID: 120693 Comm: sh Not tainted 4.16.2-0 #1
Hardware name: Oracle Corporation Sun Fire X4800/20434, BIOS 11080200
08/12/2016
RIP: 0010:insert_work+0x5f/0x70
RSP: 0018:ffff88103fa039b8 EFLAGS: 00010046
RAX: ffff88103f443f00 RBX: ffff880187c37c00 RCX: 0000000000000005
RDX: ffff880187c37c20 RSI: ffff8807c04dec00 RDI: 0000000000000000
RBP: ffff88103fa039c8 R08: 0000000000000101 R09: 0000000000000001
R10: ffff887eee68fd40 R11: 0000000000000001 R12: ffff88503fafc600
R13: 000000000001cf60 R14: ffff880187c37c00 R15: ffff88103f443f00
FS: 0000000000000000(0000) GS:ffff88103fa00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f394d2780a0 CR3: 0000000bcc936000 CR4: 00000000000006f0
Call Trace:
<IRQ>
__queue_work+0x11f/0x320
queue_work_on+0x19/0x30
fscache_enqueue_operation+0x83/0x160
cachefiles_read_waiter+0xd2/0x130
__wake_up_common+0x81/0x120
__wake_up_locked_key_bookmark+0x16/0x20
wake_up_page_bit+0x97/0xe0
unlock_page+0x20/0x30
page_endio+0x21/0xa0
mpage_end_io+0x41/0x60
bio_endio+0x78/0x90
dec_pending+0x140/0x250
? linear_status+0x40/0x40
clone_endio+0x86/0x100
bio_endio+0x78/0x90
blk_update_request+0x8d/0x2b0
scsi_end_request+0x36/0x200
scsi_io_completion+0x12a/0x5e0
scsi_finish_command+0xf2/0x150
scsi_softirq_done+0x13e/0x160
__blk_mq_complete_request+0xb8/0x180
blk_mq_complete_request+0x57/0x70
scsi_mq_done+0x10/0x20
megasas_complete_cmd+0xdf/0x620
megasas_complete_cmd_dpc+0x8f/0x100
tasklet_action+0x9a/0xb0
__do_softirq+0xbf/0x1c8
irq_exit+0x9c/0xb0
do_IRQ+0x5b/0xe0
common_interrupt+0xf/0xf
</IRQ>
RIP: 0010:_raw_spin_unlock_irqrestore+0x9/0x10
RSP: 0018:ffffc900309e3cf8 EFLAGS: 00000296 ORIG_RAX: ffffffffffffffde
RAX: 0000000000000002 RBX: 0000000000000002 RCX: 0000000000000001
RDX: ffffea0006793fe0 RSI: 0000000000000296 RDI: ffff88107ffff800
RBP: ffffc900309e3cf8 R08: 0000000000000002 R09: 000000000011b912
R10: 00000000000000e7 R11: 0000000000000000 R12: ffffea0014baa000
R13: ffff88103fa1d120 R14: ffff88107fff6000 R15: ffff88107fff6000
pagevec_lru_move_fn+0xb7/0xe0
? pagevec_move_tail_fn+0x350/0x350
__pagevec_lru_add+0x12/0x20
lru_add_drain_cpu+0xc4/0xe0
lru_add_drain+0x10/0x20
exit_mmap+0x58/0x190
? __handle_mm_fault+0x9a4/0x1540
? hrtimer_try_to_cancel+0x1b/0xa0
mmput+0x4e/0x100
do_exit+0x22f/0xa10
do_group_exit+0x3a/0xa0
SyS_exit_group+0x12/0x20
do_syscall_64+0x61/0x110
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x7f394d325fa8
RSP: 002b:00007ffda407e668 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
RAX: ffffffffffffffda RBX: 00007f394d603830 RCX: 00007f394d325fa8
RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000
RBP: 0000000000000000 R08: 00000000000000e7 R09: ffffffffffffffa8
R10: 00007f394d6097e0 R11: 0000000000000246 R12: 00007f394d603830
R13: 00007ffda407f8b2 R14: 0000000000000000 R15: 0000000000000000
Code: 48 89 4e 10 4c 89 01 e8 30 f8 ff ff f0 83 44 24 fc 00 8b 83 40
03 00 00 85 c0 75 08 f0 ff ff 48 83 c4 08 5b 5d c3 <0f> 0b eb b7 66 66
66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5
---[ end trace e5ddade8fd0233c4 ]---

Some inconclusive notes, FWIW:

First we get the workqueue warning where it's hitting the warning in
set_work_data() where it checks that the work is not pending:

static inline void set_work_data(struct work_struct *work, unsigned long data,
unsigned long flags)
{
WARN_ON_ONCE(!work_pending(work));
atomic_long_set(&work->data, data | flags | work_static(work));
}

This is odd because the flag should have been set just a few moments
before, in queue_work_on():

bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
bool ret = false;
unsigned long flags;

local_irq_save(flags);

if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_work(cpu, wq, work);
ret = true;
}

local_irq_restore(flags);
return ret;
}

Then we have the next part giving a few more clues:

FS-Cache:
FS-Cache: Assertion failed
FS-Cache: 6 == 5 is false
------------[ cut here ]------------
kernel BUG at fs/fscache/operation.c:494!
invalid opcode: 0000 [#1] SMP
Modules linked in: nbd
CPU: 1 PID: 122494 Comm: kworker/u257:3 Tainted: G W 4.16.2-0 #1
Hardware name: Oracle Corporation Sun Fire X4800/20434, BIOS 11080200
08/12/2016
Workqueue: fscache_operation fscache_op_work_func
RIP: 0010:fscache_put_operation+0x1e3/0x1f0
RSP: 0018:ffffc9002ff8be08 EFLAGS: 00010296
RAX: 0000000000000019 RBX: ffff8807c04dec00 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff88103fa56398 RDI: ffff88103fa56398
RBP: ffffc9002ff8be18 R08: 0000000000000000 R09: 0000000000000771
R10: ffffffff820afc80 R11: 0000000000000770 R12: ffff8807c04dec00
R13: ffff88103f443f00 R14: 0000000000000000 R15: ffff880187c37c00
FS: 0000000000000000(0000) GS:ffff88103fa40000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000002343008 CR3: 000000000240a000 CR4: 00000000000006e0
Call Trace:
fscache_op_work_func+0x26/0x50
process_one_work+0x131/0x290
worker_thread+0x45/0x360
kthread+0xf8/0x130
? create_worker+0x190/0x190
? kthread_cancel_work_sync+0x10/0x10
ret_from_fork+0x1f/0x30
Code: 31 c0 e8 26 11 ef ff 48 c7 c7 92 ee 2c 82 31 c0 e8 18 11 ef ff
8b 73 40 ba 05 00 00 ad 2a 82 31 c0 e8 02 11 ef ff <0f> 0b 66 66 2e 0f
1f 84 00 00 00 00 00 55 48 89 e5 53 48 8
RIP: fscache_put_operation+0x1e3/0x1f0 RSP: ffffc9002ff8be08
---[ end trace e5ddade8fd0233c5 ]---

The assertion comes from fscache_put_operation():

ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED &&
op->state != FSCACHE_OP_ST_COMPLETE,
op->state, ==, FSCACHE_OP_ST_CANCELLED);

It expects op->state to be either INITIALISED, COMPLETE, or CANCELLED,
but it is in fact DEAD.The only place that sets it to DEAD is
fscache_put_operation() itself (a few lines after this assert). So we
somehow have an operation that manages to have its refcount drop to 0
twice in a row -- without hitting the assertion that checks for a
positive refcount.

We can tell from the first stack trace that we were just in
fscache_enqueue_operation(), which has the following code:

atomic_inc(&op->usage);
if (!queue_work(fscache_op_wq, &op->work))
fscache_put_operation(op);

The reason we don't hit the positive refcount assertion is that the
atomic_inc() above probably brings it from 0 to 1.

It does smell a lot like a race causing premature free +
use-after-free explaining the weird diagnostics.

It takes about 15-20 minutes to reproduce the crash, with the patch
above it's gone, but we can test alternative patches.

Vegard

2018-06-15 18:03:58

by Anthony DeRobertis

[permalink] [raw]

Subject: Re: fscache: fix a kernel BUG at fs/fscache/operation.c:69!

On 02/22/2018 02:33 AM, [email protected] wrote:
> From: Lei Xue <[email protected]>
>
> There is a potential race in fscache operation enqueuing for reading and
> copying multiple pages from cachefiles to netfs.
> Under some heavy load system, it will happen very often.

Did anything happen with this patch? I still saw it with 4.16.12 (panic
below, captured via netconsole) when running ffmpeg with a source on a
cached NFSv3 share. (Actually an overlay on top of that cached NFS
mount). Applying this patch seems to have fixed the issue.

The only weird thing I've noticed — not sure it's actually new from this
patch — is that midway through the encode, it stopped using the cached
version and started transferring it from the server again. But even so,
that's far better than a panic that takes out the machine (until the
watchdog forces a reset).

ps: please cc me on replies, not subscribed to the mailing list.

[51593.345396] FS-Cache:
[51593.345406] FS-Cache: Assertion failed
[51593.345409] FS-Cache: 0 > 0 is false
[51593.345421] ------------[ cut here ]------------
[51593.345424] kernel BUG at /build/linux-43CEzF/linux-4.16.12/fs/fscache/operation.c:68!
[51593.345430] invalid opcode: 0000 [#1] SMP PTI
[51593.345433] Modules linked in: netconsole configfs ebtable_filter ebtables devlink ip6table_filter ip6_tables iptable_filter overlay nfsv3 nfs tun cachefiles fscache binfmt_misc snd_usb_audio snd_hda_codec_hdmi snd_usbmidi_lib xts dm_crypt hid_generic usbhid snd_rawmidi snd_seq_device hid pktcdvd snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_codec bridge snd_hda_core stp llc intel_powerclamp snd_hwdep kvm_intel kvm snd_pcm_oss mxm_wmi snd_mixer_oss snd_pcm snd_timer irqbypass intel_cstate uhci_hcd intel_uncore r8169 pcspkr ehci_pci xhci_pci xhci_hcd wmi snd ehci_hcd i2c_i801 firewire_ohci sr_mod soundcore usbcore firewire_core crc_itu_t cdrom mii sg button asus_atk0110 lpc_ich shpchp usb_common i5500_temp i7core_edac nvidia_drm(PO) drm_kms_helper drm nvidia_modeset(PO) nvidia(PO)
[51593.345468] ipmi_devintf ipmi_msghandler nfsd coretemp auth_rpcgss nfs_acl lockd grace sunrpc i2c_dev iTCO_wdt iTCO_vendor_support acpi_cpufreq msr loop ip_tables x_tables autofs4 ext4 crc16 mbcache jbd2 fscrypto ecb crypto_simd cryptd glue_helper aes_x86_64 dm_mod raid456 libcrc32c crc32c_generic async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid0 multipath linear raid1 raid10 md_mod sd_mod ahci libahci crc32c_intel evdev serio_raw libata scsi_mod
[51593.345493] CPU: 6 PID: 0 Comm: swapper/6 Tainted: P W IO 4.16.0-2-amd64 #1 Debian 4.16.12-1
[51593.345495] Hardware name: System manufacturer System Product Name/SABERTOOTH X58, BIOS 1402 08/09/2012
[51593.345502] RIP: 0010:fscache_enqueue_operation+0x106/0x150 [fscache]
[51593.345504] RSP: 0018:ffff91a767383c98 EFLAGS: 00010082
[51593.345508] RAX: 0000000000000018 RBX: ffff91a7290d4d80 RCX: 0000000000000006
[51593.345510] RDX: 0000000000000000 RSI: 0000000000000086 RDI: ffff91a767396730
[51593.345512] RBP: ffff91a3359f8a20 R08: 000000000000043e R09: 0000000000000000
[51593.345515] R10: ffff91a767383c80 R11: 0000000000000001 R12: ffff91a7290d4d80
[51593.345517] R13: ffff91a7290d4e08 R14: ffff91a703575e20 R15: ffff91a7290d4e08
[51593.345520] FS: 0000000000000000(0000) GS:ffff91a767380000(0000) knlGS:0000000000000000
[51593.345522] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[51593.345524] CR2: 000055627b19b000 CR3: 000000015680a000 CR4: 00000000000006e0
[51593.345528] Call Trace:
[51593.345531] <IRQ>
[51593.345536] cachefiles_read_waiter+0xf8/0x150 [cachefiles]
[51593.345541] __wake_up_common+0x76/0x170
[51593.345544] wake_up_page_bit+0xac/0x100
[51593.345561] mpage_end_io+0x30/0x90 [ext4]
[51593.345568] clone_endio+0x8e/0x140 [dm_mod]
[51593.345572] raid_end_bio_io+0x2c/0xb0 [raid10]
[51593.345576] raid10_end_read_request+0x71/0x150 [raid10]
[51593.345581] blk_update_request+0x78/0x2b0
[51593.345589] scsi_end_request+0x2c/0x1e0 [scsi_mod]
[51593.345597] scsi_io_completion+0x477/0x670 [scsi_mod]
[51593.345600] blk_done_softirq+0xa0/0xd0
[51593.345605] __do_softirq+0xde/0x2b4
[51593.345609] irq_exit+0xae/0xb0
[51593.345612] do_IRQ+0x7d/0xc0
[51593.345615] common_interrupt+0xf/0xf
[51593.345617] </IRQ>
[51593.345622] RIP: 0010:cpuidle_enter_state+0xa0/0x2b0
[51593.345624] RSP: 0018:ffffb3f1431c7eb0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdc
[51593.345627] RAX: ffff91a7673a18c0 RBX: 00002eec8305e1d8 RCX: 000000000000001f
[51593.345629] RDX: 00002eec8305e1d8 RSI: 000000002d9c529c RDI: 0000000000000000
[51593.345632] RBP: ffff91a7673a9948 R08: 0000000000000001 R09: 0000000000000135
[51593.345634] R10: ffffb3f1431c7e90 R11: 00000000000000a0 R12: 0000000000000003
[51593.345636] R13: ffffffff9cab2cd8 R14: 0000000000000000 R15: 00002eec83032f50
[51593.345641] do_idle+0x183/0x1e0
[51593.345644] cpu_startup_entry+0x6f/0x80
[51593.345649] start_secondary+0x1a4/0x1f0
[51593.345652] secondary_startup_64+0xa5/0xb0
[51593.345655] Code: 0b 48 c7 c7 63 d9 7d c1 e8 72 dd 2f da 48 c7 c7 71 d9 7d c1 e8 66 dd 2f da 48 63 73 44 31 d2 48 c7 c7 a0 ce 7d c1 e8 54 dd 2f da <0f> 0b 48 c7 c7 63 d9 7d c1 e8 46 dd 2f da 48 c7 c7 71 d9 7d c1
[51593.345682] RIP: fscache_enqueue_operation+0x106/0x150 [fscache] RSP: ffff91a767383c98
[51593.345688] ---[ end trace fe061122063cafae ]---
[51593.345690] Kernel panic - not syncing: Fatal exception in interrupt
[51593.345782] Kernel Offset: 0x1aa00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[51593.345786] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
[51593.345793] WARNING: CPU: 6 PID: 0 at /build/linux-43CEzF/linux-4.16.12/kernel/sched/core.c:1190 set_task_cpu+0x14d/0x160
[51593.345795] Modules linked in: netconsole configfs ebtable_filter ebtables devlink ip6table_filter ip6_tables iptable_filter overlay nfsv3 nfs tun cachefiles fscache binfmt_misc snd_usb_audio snd_hda_codec_hdmi snd_usbmidi_lib xts dm_crypt hid_generic usbhid snd_rawmidi snd_seq_device hid pktcdvd snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_codec bridge snd_hda_core stp llc intel_powerclamp snd_hwdep kvm_intel kvm snd_pcm_oss mxm_wmi snd_mixer_oss snd_pcm snd_timer irqbypass intel_cstate uhci_hcd intel_uncore r8169 pcspkr ehci_pci xhci_pci xhci_hcd wmi snd ehci_hcd i2c_i801 firewire_ohci sr_mod soundcore usbcore firewire_core crc_itu_t cdrom mii sg button asus_atk0110 lpc_ich shpchp usb_common i5500_temp i7core_edac nvidia_drm(PO) drm_kms_helper drm nvidia_modeset(PO) nvidia(PO)
[51593.345823] ipmi_devintf ipmi_msghandler nfsd coretemp auth_rpcgss nfs_acl lockd grace sunrpc i2c_dev iTCO_wdt iTCO_vendor_support acpi_cpufreq msr loop ip_tables x_tables autofs4 ext4 crc16 mbcache jbd2 fscrypto ecb crypto_simd cryptd glue_helper aes_x86_64 dm_mod raid456 libcrc32c crc32c_generic async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid0 multipath linear raid1 raid10 md_mod sd_mod ahci libahci crc32c_intel evdev serio_raw libata scsi_mod
[51593.345844] CPU: 6 PID: 0 Comm: swapper/6 Tainted: P D W IO 4.16.0-2-amd64 #1 Debian 4.16.12-1
[51593.345846] Hardware name: System manufacturer System Product Name/SABERTOOTH X58, BIOS 1402 08/09/2012
[51593.345849] RIP: 0010:set_task_cpu+0x14d/0x160
[51593.345851] RSP: 0018:ffff91a7673837f0 EFLAGS: 00010006
[51593.345853] RAX: 0000000000000200 RBX: ffff91a74e31c380 RCX: 0000000000000000
[51593.345855] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff91a74e31c380
[51593.345857] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000006
[51593.345859] R10: 0000000000000007 R11: 0000000000000000 R12: 0000000000000000
[51593.345861] R13: 0000000000000000 R14: 0000000000000046 R15: 00000000000218c0
[51593.345864] FS: 0000000000000000(0000) GS:ffff91a767380000(0000) knlGS:0000000000000000
[51593.345866] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[51593.345868] CR2: 000055627b19b000 CR3: 000000015680a000 CR4: 00000000000006e0
[51593.345870] Call Trace:
[51593.345872] <IRQ>
[51593.345875] try_to_wake_up+0x154/0x460
[51593.345879] autoremove_wake_function+0x11/0x50
[51593.345881] __wake_up_common+0x76/0x170
[51593.345884] __wake_up_common_lock+0x7c/0xc0
[51593.345888] irq_work_run_list+0x4d/0x70
[51593.345891] smp_irq_work_interrupt+0x32/0xc0
[51593.345894] irq_work_interrupt+0xf/0x20
[51593.345898] RIP: 0010:panic+0x1fb/0x252
[51593.345900] RSP: 0018:ffff91a767383a48 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff09
[51593.345903] RAX: 0000000000000041 RBX: 0000000000000000 RCX: 0000000000000006
[51593.345905] RDX: 0000000000000000 RSI: 0000000000000082 RDI: ffff91a767396730
[51593.345908] RBP: ffff91a767383ac0 R08: 0000000000000472 R09: 0000000000000000
[51593.345910] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[51593.345912] R13: 0000000000000000 R14: ffffffff9c80922d R15: 0000000000000004
[51593.345915] ? irq_work_interrupt+0xa/0x20
[51593.345919] ? panic+0x1f7/0x252
[51593.345923] oops_end+0xac/0xc0
[51593.345927] do_trap+0x132/0x140
[51593.345930] do_error_trap+0x9d/0x120
[51593.345934] ? fscache_enqueue_operation+0x106/0x150 [fscache]
[51593.345938] ? up+0x12/0x60
[51593.345941] ? console_unlock+0x29d/0x550
[51593.345944] invalid_op+0x1b/0x40
[51593.345949] RIP: 0010:fscache_enqueue_operation+0x106/0x150 [fscache]

2018-07-04 00:11:51

by NeilBrown

[permalink] [raw]

Subject: [PATCH] cachefiles: fix multiple-put race.

cachefiles_read_waiter() owns a reference to a 'monitor'
object and adds the object to a 'to_do' list *before*
calling fscache_enqueue_retrieval() on it.

The reference is passed to the to_do list, so
cachefiles_read_waiter() no longer owns a reference and shouldn't be
accessing the monitor.
cachefiles_read_copier(), which handles the to_do list, might call
fscache_put_retrieval() *before* fscache_enqueue_retrieval()
takes its own reference.

This can result in fscache_put_retrieval() trying to discard the op
twice, which triggers an assertion failure, and can result in freed
memory be accessed.

The former looks like:

FS-Cache:
FS-Cache: Assertion failed
FS-Cache: 6 == 5 is false
------------[ cut here ]------------
kernel BUG at fs/fscache/operation.c:494!

A previous patch from Lei Xue moved the fscache_enqueue_retrieval()
call inside the spin_locked region. I think it is cleaner to move it
before.

Reported-by: Lei Xue <[email protected]>
Reported-by: Vegard Nossum <[email protected]>
Reported-by: Anthony DeRobertis <[email protected]>
Signed-off-by: NeilBrown <[email protected]>
---

hi Andrew,
this issue was first mentioned in
https://lkml.org/lkml/2018/2/22/82
in February, but David Howells doesn't seem to have responded.
Can you take the patch?

Thanks,
NeilBrown

fs/cachefiles/rdwr.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 5082c8a49686..553a71a2c9cb 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -56,11 +56,12 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
object = container_of(monitor->op->op.object,
struct cachefiles_object, fscache);

+ fscache_enqueue_retrieval(monitor->op);
+
spin_lock(&object->work_lock);
list_add_tail(&monitor->op_link, &monitor->op->to_do);
spin_unlock(&object->work_lock);

- fscache_enqueue_retrieval(monitor->op);
return 0;
}

--
2.14.0.rc0.dirty

Attachments:

signature.asc (847.00 B)

2018-07-04 09:15:57

by David Howells

[permalink] [raw]

Subject: Re: [PATCH] cachefiles: fix multiple-put race.

NeilBrown <[email protected]> wrote:

> + fscache_enqueue_retrieval(monitor->op);
> +
> spin_lock(&object->work_lock);
> list_add_tail(&monitor->op_link, &monitor->op->to_do);
> spin_unlock(&object->work_lock);
>
> - fscache_enqueue_retrieval(monitor->op);

That won't necessarily work because the work processor can then happen before
you've added the work to the to_do list.

I'm thinking that KiranKumar's solution might be the best one.

The problem is that cachefiles_read_waiter() doesn't have a ref on the monitor
object but is entirely dependent on the waitqueue lock for safety. I think
KiranKumar's patch is correct to take a ref before doing the queuing. It
might be possible to then pass this along to the work processor, but that
might be too fiddly.

Actually, I want to get rid of the page monitoring stuff entirely as it's
quite fragile and use an iterator and direct-IO instead, but we have to fix
this for now.

David

2018-07-04 12:30:31

by David Howells

[permalink] [raw]

Subject: [PATCH] cachefiles: Fix assertion "6 == 5 is false" at fs/fscache/operation.c:494

So something like the attached. Possibly the changes to operation.c should be
split into a separate patch.

David
---
commit f3d49bdba2756bda41d18280b34f04c80f7f324c
Author: kiran modukuri <[email protected]>
Date: Tue Jul 18 16:25:49 2017 -0700

cachefiles: Fix assertion "6 == 5 is false" at fs/fscache/operation.c:494

There is a potential race in fscache operation enqueuing for reading and
copying multiple pages from cachefiles to netfs.
Under some heavy load system, it will happen very often.

If this race occurs, an oops similar to the following is seen:

kernel BUG at fs/fscache/operation.c:69!
invalid opcode: 0000 [#1] SMP
...
#0 [ffff883fff0838d8] machine_kexec at ffffffff81051beb
#1 [ffff883fff083938] crash_kexec at ffffffff810f2542
#2 [ffff883fff083a08] oops_end at ffffffff8163e1a8
#3 [ffff883fff083a30] die at ffffffff8101859b
#4 [ffff883fff083a60] do_trap at ffffffff8163d860
#5 [ffff883fff083ab0] do_invalid_op at ffffffff81015204
#6 [ffff883fff083b60] invalid_op at ffffffff8164701e
[exception RIP: fscache_enqueue_operation+246]
RIP: ffffffffa0b793c6 RSP: ffff883fff083c18 RFLAGS: 00010046
RAX: 0000000000000019 RBX: ffff8832ed1a9ec0 RCX: 0000000000000006
RDX: 0000000000000000 RSI: 0000000000000046 RDI: 0000000000000046
RBP: ffff883fff083c20 R8: 0000000000000086 R9: 000000000000178f
R10: ffffffff816aeb00 R11: ffff883fff08392e R12: ffff8802f0525620
R13: ffff88407ffc01d8 R14: 0000000000000000 R15: 0000000000000003
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000
#7 [ffff883fff083c10] fscache_enqueue_operation at ffffffffa0b793c6
#8 [ffff883fff083c28] cachefiles_read_waiter at ffffffffa0b15a48
#9 [ffff883fff083c48] __wake_up_common at ffffffff810af028

Reported-by: Lei Xue <[email protected]>
Reported-by: Vegard Nossum <[email protected]>
Reported-by: Anthony DeRobertis <[email protected]>
Reported-by: NeilBrown <[email protected]>
Reported-by: Daniel Axtens <[email protected]>
Reported-by: KiranKumar Modukuri <[email protected]>
Signed-off-by: David Howells <[email protected]>

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 5082c8a49686..b2cb6cdcd87f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -27,6 +27,7 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
struct cachefiles_one_read *monitor =
container_of(wait, struct cachefiles_one_read, monitor);
struct cachefiles_object *object;
+ struct fscache_retrieval *op = monitor->op;
struct wait_bit_key *key = _key;
struct page *page = wait->private;

@@ -51,16 +52,22 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
list_del(&wait->entry);

/* move onto the action list and queue for FS-Cache thread pool */
- ASSERT(monitor->op);
+ ASSERT(op);

- object = container_of(monitor->op->op.object,
- struct cachefiles_object, fscache);
+ /* We need to temporarily bump the usage count as we don't own a ref
+ * here otherwise cachefiles_read_copier() may free the op between the
+ * monitor being enqueued on the op->to_do list and the op getting
+ * enqueued on the work queue.
+ */
+ fscache_get_retrieval(op);

+ object = container_of(op->op.object, struct cachefiles_object, fscache);
spin_lock(&object->work_lock);
list_add_tail(&monitor->op_link, &monitor->op->to_do);
spin_unlock(&object->work_lock);

fscache_enqueue_retrieval(monitor->op);
+ fscache_put_retrieval(op);
return 0;
}

diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e30c5975ea58..28ce646cc32c 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -69,8 +69,9 @@ void fscache_enqueue_operation(struct fscache_operation *op)
ASSERT(list_empty(&op->pend_link));
ASSERT(op->processor != NULL);
ASSERT(fscache_object_is_available(op->object));
- ASSERTCMP(atomic_read(&op->usage), >, 0);
- ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
+ ASSERTCMP(atomic_read(&op->usage), >, 1);
+ ASSERTIFCMP(op->state != FSCACHE_OP_ST_IN_PROGRESS,
+ op->state, ==, FSCACHE_OP_ST_CANCELLED);

fscache_stat(&fscache_n_op_enqueue);
switch (op->flags & FSCACHE_OP_TYPE) {
@@ -499,7 +500,8 @@ void fscache_put_operation(struct fscache_operation *op)
struct fscache_cache *cache;

_enter("{OBJ%x OP%x,%d}",
- op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+ op->object ? op->object->debug_id : 0,
+ op->debug_id, atomic_read(&op->usage));

ASSERTCMP(atomic_read(&op->usage), >, 0);

2018-07-04 13:31:24

by David Howells

[permalink] [raw]

Subject: Re: [PATCH] cachefiles: Fix assertion "6 == 5 is false" at fs/fscache/operation.c:494

David Howells <[email protected]> wrote:

> So something like the attached. Possibly the changes to operation.c should be
> split into a separate patch.

Bah. Helps if I try committing all of my changes first. Revised patch
attached.

David.
---
commit a0d29054c1c7bf575d73cced446dec6fcba30e0d
Author: kiran modukuri <[email protected]>
Date: Tue Jul 18 16:25:49 2017 -0700

cachefiles: Fix assertion "6 == 5 is false" at fs/fscache/operation.c:494

There is a potential race in fscache operation enqueuing for reading and
copying multiple pages from cachefiles to netfs.
Under some heavy load system, it will happen very often.

If this race occurs, an oops similar to the following is seen:

kernel BUG at fs/fscache/operation.c:69!
invalid opcode: 0000 [#1] SMP
...
#0 [ffff883fff0838d8] machine_kexec at ffffffff81051beb
#1 [ffff883fff083938] crash_kexec at ffffffff810f2542
#2 [ffff883fff083a08] oops_end at ffffffff8163e1a8
#3 [ffff883fff083a30] die at ffffffff8101859b
#4 [ffff883fff083a60] do_trap at ffffffff8163d860
#5 [ffff883fff083ab0] do_invalid_op at ffffffff81015204
#6 [ffff883fff083b60] invalid_op at ffffffff8164701e
[exception RIP: fscache_enqueue_operation+246]
RIP: ffffffffa0b793c6 RSP: ffff883fff083c18 RFLAGS: 00010046
RAX: 0000000000000019 RBX: ffff8832ed1a9ec0 RCX: 0000000000000006
RDX: 0000000000000000 RSI: 0000000000000046 RDI: 0000000000000046
RBP: ffff883fff083c20 R8: 0000000000000086 R9: 000000000000178f
R10: ffffffff816aeb00 R11: ffff883fff08392e R12: ffff8802f0525620
R13: ffff88407ffc01d8 R14: 0000000000000000 R15: 0000000000000003
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000
#7 [ffff883fff083c10] fscache_enqueue_operation at ffffffffa0b793c6
#8 [ffff883fff083c28] cachefiles_read_waiter at ffffffffa0b15a48
#9 [ffff883fff083c48] __wake_up_common at ffffffff810af028

Reported-by: Lei Xue <[email protected]>
Reported-by: Vegard Nossum <[email protected]>
Reported-by: Anthony DeRobertis <[email protected]>
Reported-by: NeilBrown <[email protected]>
Reported-by: Daniel Axtens <[email protected]>
Reported-by: KiranKumar Modukuri <[email protected]>
Signed-off-by: David Howells <[email protected]>

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 5082c8a49686..40f7595aad10 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -27,6 +27,7 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
struct cachefiles_one_read *monitor =
container_of(wait, struct cachefiles_one_read, monitor);
struct cachefiles_object *object;
+ struct fscache_retrieval *op = monitor->op;
struct wait_bit_key *key = _key;
struct page *page = wait->private;

@@ -51,16 +52,22 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
list_del(&wait->entry);

/* move onto the action list and queue for FS-Cache thread pool */
- ASSERT(monitor->op);
+ ASSERT(op);

- object = container_of(monitor->op->op.object,
- struct cachefiles_object, fscache);
+ /* We need to temporarily bump the usage count as we don't own a ref
+ * here otherwise cachefiles_read_copier() may free the op between the
+ * monitor being enqueued on the op->to_do list and the op getting
+ * enqueued on the work queue.
+ */
+ fscache_get_retrieval(op);

+ object = container_of(op->op.object, struct cachefiles_object, fscache);
spin_lock(&object->work_lock);
- list_add_tail(&monitor->op_link, &monitor->op->to_do);
+ list_add_tail(&monitor->op_link, &op->to_do);
spin_unlock(&object->work_lock);

- fscache_enqueue_retrieval(monitor->op);
+ fscache_enqueue_retrieval(op);
+ fscache_put_retrieval(op);
return 0;
}

diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e30c5975ea58..8d265790374c 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -70,7 +70,8 @@ void fscache_enqueue_operation(struct fscache_operation *op)
ASSERT(op->processor != NULL);
ASSERT(fscache_object_is_available(op->object));
ASSERTCMP(atomic_read(&op->usage), >, 0);
- ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
+ ASSERTIFCMP(op->state != FSCACHE_OP_ST_IN_PROGRESS,
+ op->state, ==, FSCACHE_OP_ST_CANCELLED);

fscache_stat(&fscache_n_op_enqueue);
switch (op->flags & FSCACHE_OP_TYPE) {
@@ -499,7 +500,8 @@ void fscache_put_operation(struct fscache_operation *op)
struct fscache_cache *cache;

_enter("{OBJ%x OP%x,%d}",
- op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+ op->object ? op->object->debug_id : 0,
+ op->debug_id, atomic_read(&op->usage));

ASSERTCMP(atomic_read(&op->usage), >, 0);