2016-11-14 17:48:45

by Peter Zijlstra

[permalink] [raw]
Subject: [RFC][PATCH 2/7] kref: Add kref_read()

Since we need to change the implementation, stop exposing internals.

Provide kref_read() to read the current reference count; typically
used for debug messages.

Kills two anti-patterns:

atomic_read(&kref->refcount)
kref->refcount.counter

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
drivers/block/drbd/drbd_req.c | 2 -
drivers/block/rbd.c | 8 ++---
drivers/block/virtio_blk.c | 2 -
drivers/gpu/drm/drm_gem_cma_helper.c | 2 -
drivers/gpu/drm/drm_info.c | 2 -
drivers/gpu/drm/drm_mode_object.c | 4 +-
drivers/gpu/drm/etnaviv/etnaviv_gem.c | 2 -
drivers/gpu/drm/msm/msm_gem.c | 2 -
drivers/gpu/drm/nouveau/nouveau_fence.c | 2 -
drivers/gpu/drm/omapdrm/omap_gem.c | 2 -
drivers/gpu/drm/ttm/ttm_bo.c | 4 +-
drivers/gpu/drm/ttm/ttm_object.c | 2 -
drivers/infiniband/hw/cxgb3/iwch_cm.h | 6 ++--
drivers/infiniband/hw/cxgb3/iwch_qp.c | 2 -
drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 6 ++--
drivers/infiniband/hw/cxgb4/qp.c | 2 -
drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 6 ++--
drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 4 +-
drivers/misc/genwqe/card_dev.c | 2 -
drivers/misc/mei/debugfs.c | 2 -
drivers/pci/hotplug/pnv_php.c | 2 -
drivers/pci/slot.c | 2 -
drivers/scsi/bnx2fc/bnx2fc_io.c | 8 ++---
drivers/scsi/cxgbi/libcxgbi.h | 4 +-
drivers/scsi/lpfc/lpfc_debugfs.c | 2 -
drivers/scsi/lpfc/lpfc_els.c | 2 -
drivers/scsi/lpfc/lpfc_hbadisc.c | 40 +++++++++++++--------------
drivers/scsi/lpfc/lpfc_init.c | 3 --
drivers/scsi/qla2xxx/tcm_qla2xxx.c | 4 +-
drivers/staging/android/ion/ion.c | 2 -
drivers/staging/comedi/comedi_buf.c | 2 -
drivers/target/target_core_pr.c | 10 +++---
drivers/target/tcm_fc/tfc_sess.c | 2 -
drivers/usb/gadget/function/f_fs.c | 2 -
fs/exofs/sys.c | 2 -
fs/ocfs2/cluster/netdebug.c | 2 -
fs/ocfs2/cluster/tcp.c | 2 -
fs/ocfs2/dlm/dlmdebug.c | 12 ++++----
fs/ocfs2/dlm/dlmdomain.c | 2 -
fs/ocfs2/dlm/dlmmaster.c | 8 ++---
fs/ocfs2/dlm/dlmunlock.c | 2 -
include/drm/drm_framebuffer.h | 2 -
include/drm/ttm/ttm_bo_driver.h | 4 +-
include/linux/kref.h | 5 +++
include/linux/sunrpc/cache.h | 2 -
include/net/bluetooth/hci_core.h | 4 +-
net/bluetooth/6lowpan.c | 2 -
net/bluetooth/a2mp.c | 4 +-
net/bluetooth/amp.c | 4 +-
net/bluetooth/l2cap_core.c | 4 +-
net/ceph/messenger.c | 4 +-
net/ceph/osd_client.c | 10 +++---
net/sunrpc/cache.c | 2 -
net/sunrpc/svc_xprt.c | 6 ++--
net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 +-
55 files changed, 120 insertions(+), 116 deletions(-)

--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -520,7 +520,7 @@ static void mod_rq_state(struct drbd_req
/* Completion does it's own kref_put. If we are going to
* kref_sub below, we need req to be still around then. */
int at_least = k_put + !!c_put;
- int refcount = atomic_read(&req->kref.refcount);
+ int refcount = kref_read(&req->kref);
if (refcount < at_least)
drbd_err(device,
"mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1535,7 +1535,7 @@ static bool obj_request_overlaps_parent(
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
dout("%s: obj %p (was %d)\n", __func__, obj_request,
- atomic_read(&obj_request->kref.refcount));
+ kref_read(&obj_request->kref));
kref_get(&obj_request->kref);
}

@@ -1544,14 +1544,14 @@ static void rbd_obj_request_put(struct r
{
rbd_assert(obj_request != NULL);
dout("%s: obj %p (was %d)\n", __func__, obj_request,
- atomic_read(&obj_request->kref.refcount));
+ kref_read(&obj_request->kref));
kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
dout("%s: img %p (was %d)\n", __func__, img_request,
- atomic_read(&img_request->kref.refcount));
+ kref_read(&img_request->kref));
kref_get(&img_request->kref);
}

@@ -1562,7 +1562,7 @@ static void rbd_img_request_put(struct r
{
rbd_assert(img_request != NULL);
dout("%s: img %p (was %d)\n", __func__, img_request,
- atomic_read(&img_request->kref.refcount));
+ kref_read(&img_request->kref));
if (img_request_child_test(img_request))
kref_put(&img_request->kref, rbd_parent_request_destroy);
else
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -767,7 +767,7 @@ static void virtblk_remove(struct virtio
/* Stop all the virtqueues. */
vdev->config->reset(vdev);

- refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
+ refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
put_disk(vblk->disk);
vdev->config->del_vqs(vdev);
kfree(vblk->vqs);
--- a/drivers/gpu/drm/drm_gem_cma_helper.c
+++ b/drivers/gpu/drm/drm_gem_cma_helper.c
@@ -376,7 +376,7 @@ void drm_gem_cma_describe(struct drm_gem
off = drm_vma_node_start(&obj->vma_node);

seq_printf(m, "%2d (%2d) %08llx %pad %p %zu",
- obj->name, obj->refcount.refcount.counter,
+ obj->name, kref_read(&obj->refcount),
off, &cma_obj->paddr, cma_obj->vaddr, obj->size);

seq_printf(m, "\n");
--- a/drivers/gpu/drm/drm_info.c
+++ b/drivers/gpu/drm/drm_info.c
@@ -118,7 +118,7 @@ static int drm_gem_one_name_info(int id,
seq_printf(m, "%6d %8zd %7d %8d\n",
obj->name, obj->size,
obj->handle_count,
- atomic_read(&obj->refcount.refcount));
+ kref_read(&obj->refcount));
return 0;
}

--- a/drivers/gpu/drm/drm_mode_object.c
+++ b/drivers/gpu/drm/drm_mode_object.c
@@ -159,7 +159,7 @@ EXPORT_SYMBOL(drm_mode_object_find);
void drm_mode_object_unreference(struct drm_mode_object *obj)
{
if (obj->free_cb) {
- DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, atomic_read(&obj->refcount.refcount));
+ DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, kref_read(&obj->refcount));
kref_put(&obj->refcount, obj->free_cb);
}
}
@@ -176,7 +176,7 @@ EXPORT_SYMBOL(drm_mode_object_unreferenc
void drm_mode_object_reference(struct drm_mode_object *obj)
{
if (obj->free_cb) {
- DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, atomic_read(&obj->refcount.refcount));
+ DRM_DEBUG("OBJ ID: %d (%d)\n", obj->id, kref_read(&obj->refcount));
kref_get(&obj->refcount);
}
}
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -491,7 +491,7 @@ static void etnaviv_gem_describe(struct

seq_printf(m, "%08x: %c %2d (%2d) %08lx %p %zd\n",
etnaviv_obj->flags, is_active(etnaviv_obj) ? 'A' : 'I',
- obj->name, obj->refcount.refcount.counter,
+ obj->name, kref_read(&obj->refcount),
off, etnaviv_obj->vaddr, obj->size);

rcu_read_lock();
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -652,7 +652,7 @@ void msm_gem_describe(struct drm_gem_obj

seq_printf(m, "%08x: %c %2d (%2d) %08llx %p %zu%s\n",
msm_obj->flags, is_active(msm_obj) ? 'A' : 'I',
- obj->name, obj->refcount.refcount.counter,
+ obj->name, kref_read(&obj->refcount),
off, msm_obj->vaddr, obj->size, madv);

rcu_read_lock();
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -527,7 +527,7 @@ static bool nouveau_fence_no_signaling(s
* caller should have a reference on the fence,
* else fence could get freed here
*/
- WARN_ON(atomic_read(&fence->base.refcount.refcount) <= 1);
+ WARN_ON(kref_read(&fence->base.refcount) <= 1);

/*
* This needs uevents to work correctly, but fence_add_callback relies on
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -1035,7 +1035,7 @@ void omap_gem_describe(struct drm_gem_ob
off = drm_vma_node_start(&obj->vma_node);

seq_printf(m, "%08x: %2d (%2d) %08llx %pad (%2d) %p %4d",
- omap_obj->flags, obj->name, obj->refcount.refcount.counter,
+ omap_obj->flags, obj->name, kref_read(&obj->refcount),
off, &omap_obj->paddr, omap_obj->paddr_cnt,
omap_obj->vaddr, omap_obj->roll);

--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -140,8 +140,8 @@ static void ttm_bo_release_list(struct k
struct ttm_bo_device *bdev = bo->bdev;
size_t acc_size = bo->acc_size;

- BUG_ON(atomic_read(&bo->list_kref.refcount));
- BUG_ON(atomic_read(&bo->kref.refcount));
+ BUG_ON(kref_read(&bo->list_kref));
+ BUG_ON(kref_read(&bo->kref));
BUG_ON(atomic_read(&bo->cpu_writers));
BUG_ON(bo->mem.mm_node != NULL);
BUG_ON(!list_empty(&bo->lru));
--- a/drivers/gpu/drm/ttm/ttm_object.c
+++ b/drivers/gpu/drm/ttm/ttm_object.c
@@ -304,7 +304,7 @@ bool ttm_ref_object_exists(struct ttm_ob
* Verify that the ref->obj pointer was actually valid!
*/
rmb();
- if (unlikely(atomic_read(&ref->kref.refcount) == 0))
+ if (unlikely(kref_read(&ref->kref) == 0))
goto out_false;

rcu_read_unlock();
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.h
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h
@@ -55,14 +55,14 @@

#define put_ep(ep) { \
PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \
- ep, atomic_read(&((ep)->kref.refcount))); \
- WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+ ep, kref_read(&((ep)->kref))); \
+ WARN_ON(kref_read(&((ep)->kref)) < 1); \
kref_put(&((ep)->kref), __free_ep); \
}

#define get_ep(ep) { \
PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
- ep, atomic_read(&((ep)->kref.refcount))); \
+ ep, kref_read(&((ep)->kref))); \
kref_get(&((ep)->kref)); \
}

--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -961,7 +961,7 @@ int iwch_modify_qp(struct iwch_dev *rhp,
case IWCH_QP_STATE_RTS:
switch (attrs->next_state) {
case IWCH_QP_STATE_CLOSING:
- BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+ BUG_ON(kref_read(&qhp->ep->com.kref) < 2);
qhp->attr.state = IWCH_QP_STATE_CLOSING;
if (!internal) {
abort=0;
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -654,14 +654,14 @@ enum c4iw_mmid_state {

#define c4iw_put_ep(ep) { \
PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \
- ep, atomic_read(&((ep)->kref.refcount))); \
- WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+ ep, kref_read(&((ep)->kref))); \
+ WARN_ON(kref_read(&((ep)->kref)) < 1); \
kref_put(&((ep)->kref), _c4iw_free_ep); \
}

#define c4iw_get_ep(ep) { \
PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
- ep, atomic_read(&((ep)->kref.refcount))); \
+ ep, kref_read(&((ep)->kref))); \
kref_get(&((ep)->kref)); \
}
void _c4iw_free_ep(struct kref *kref);
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -1498,7 +1498,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp,
case C4IW_QP_STATE_RTS:
switch (attrs->next_state) {
case C4IW_QP_STATE_CLOSING:
- BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+ BUG_ON(kref_read(&qhp->ep->com.kref) < 2);
t4_set_wq_in_error(&qhp->wq);
set_state(qhp, C4IW_QP_STATE_CLOSING);
ep = qhp->ep;
--- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -80,7 +80,7 @@ usnic_ib_show_config(struct device *devi
left = PAGE_SIZE;

mutex_lock(&us_ibdev->usdev_lock);
- if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) {
+ if (kref_read(&us_ibdev->vf_cnt) > 0) {
char *busname;

/*
@@ -99,7 +99,7 @@ usnic_ib_show_config(struct device *devi
PCI_FUNC(us_ibdev->pdev->devfn),
netdev_name(us_ibdev->netdev),
us_ibdev->ufdev->mac,
- atomic_read(&us_ibdev->vf_cnt.refcount));
+ kref_read(&us_ibdev->vf_cnt));
UPDATE_PTR_LEFT(n, ptr, left);

for (res_type = USNIC_VNIC_RES_TYPE_EOL;
@@ -147,7 +147,7 @@ usnic_ib_show_max_vf(struct device *devi
us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);

return scnprintf(buf, PAGE_SIZE, "%u\n",
- atomic_read(&us_ibdev->vf_cnt.refcount));
+ kref_read(&us_ibdev->vf_cnt));
}

static ssize_t
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -291,11 +291,11 @@ int usnic_ib_query_device(struct ib_devi
qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
props->max_qp = qp_per_vf *
- atomic_read(&us_ibdev->vf_cnt.refcount);
+ kref_read(&us_ibdev->vf_cnt);
props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT |
IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] *
- atomic_read(&us_ibdev->vf_cnt.refcount);
+ kref_read(&us_ibdev->vf_cnt);
props->max_pd = USNIC_UIOM_MAX_PD_CNT;
props->max_mr = USNIC_UIOM_MAX_MR_CNT;
props->local_ca_ack_delay = 0;
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -1396,7 +1396,7 @@ int genwqe_device_remove(struct genwqe_d
* application which will decrease this reference from
* 1/unused to 0/illegal and not from 2/used 1/empty.
*/
- rc = atomic_read(&cd->cdev_genwqe.kobj.kref.refcount);
+ rc = kref_read(&cd->cdev_genwqe.kobj.kref);
if (rc != 1) {
dev_err(&pci_dev->dev,
"[%s] err: cdev_genwqe...refcount=%d\n", __func__, rc);
--- a/drivers/misc/mei/debugfs.c
+++ b/drivers/misc/mei/debugfs.c
@@ -67,7 +67,7 @@ static ssize_t mei_dbgfs_read_meclients(
me_cl->props.max_number_of_connections,
me_cl->props.max_msg_length,
me_cl->props.single_recv_buf,
- atomic_read(&me_cl->refcnt.refcount));
+ kref_read(&me_cl->refcnt));

mei_me_cl_put(me_cl);
}
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -155,7 +155,7 @@ static void pnv_php_detach_device_nodes(
pnv_php_detach_device_nodes(dn);

of_node_put(dn);
- refcount = atomic_read(&dn->kobj.kref.refcount);
+ refcount = kref_read(&dn->kobj.kref);
if (refcount != 1)
pr_warn("Invalid refcount %d on <%s>\n",
refcount, of_node_full_name(dn));
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(pci_create_slot);
void pci_destroy_slot(struct pci_slot *slot)
{
dev_dbg(&slot->bus->dev, "dev %02x, dec refcount to %d\n",
- slot->number, atomic_read(&slot->kobj.kref.refcount) - 1);
+ slot->number, kref_read(&slot->kobj.kref) - 1);

mutex_lock(&pci_slot_mutex);
kobject_put(&slot->kobj);
--- a/drivers/scsi/bnx2fc/bnx2fc_io.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_io.c
@@ -74,7 +74,7 @@ static void bnx2fc_cmd_timeout(struct wo
&io_req->req_flags)) {
/* Handle internally generated ABTS timeout */
BNX2FC_IO_DBG(io_req, "ABTS timed out refcnt = %d\n",
- io_req->refcount.refcount.counter);
+ kref_read(&io_req->refcount));
if (!(test_and_set_bit(BNX2FC_FLAG_ABTS_DONE,
&io_req->req_flags))) {
/*
@@ -1141,7 +1141,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc
return SUCCESS;
}
BNX2FC_IO_DBG(io_req, "eh_abort - refcnt = %d\n",
- io_req->refcount.refcount.counter);
+ kref_read(&io_req->refcount));

/* Hold IO request across abort processing */
kref_get(&io_req->refcount);
@@ -1299,7 +1299,7 @@ void bnx2fc_process_cleanup_compl(struct
{
BNX2FC_IO_DBG(io_req, "Entered process_cleanup_compl "
"refcnt = %d, cmd_type = %d\n",
- io_req->refcount.refcount.counter, io_req->cmd_type);
+ kref_read(&io_req->refcount), io_req->cmd_type);
bnx2fc_scsi_done(io_req, DID_ERROR);
kref_put(&io_req->refcount, bnx2fc_cmd_release);
if (io_req->wait_for_comp)
@@ -1318,7 +1318,7 @@ void bnx2fc_process_abts_compl(struct bn
BNX2FC_IO_DBG(io_req, "Entered process_abts_compl xid = 0x%x"
"refcnt = %d, cmd_type = %d\n",
io_req->xid,
- io_req->refcount.refcount.counter, io_req->cmd_type);
+ kref_read(&io_req->refcount), io_req->cmd_type);

if (test_and_set_bit(BNX2FC_FLAG_ABTS_DONE,
&io_req->req_flags)) {
--- a/drivers/scsi/cxgbi/libcxgbi.h
+++ b/drivers/scsi/cxgbi/libcxgbi.h
@@ -300,7 +300,7 @@ static inline void __cxgbi_sock_put(cons
{
log_debug(1 << CXGBI_DBG_SOCK,
"%s, put csk 0x%p, ref %u-1.\n",
- fn, csk, atomic_read(&csk->refcnt.refcount));
+ fn, csk, kref_read(&csk->refcnt));
kref_put(&csk->refcnt, cxgbi_sock_free);
}
#define cxgbi_sock_put(csk) __cxgbi_sock_put(__func__, csk)
@@ -309,7 +309,7 @@ static inline void __cxgbi_sock_get(cons
{
log_debug(1 << CXGBI_DBG_SOCK,
"%s, get csk 0x%p, ref %u+1.\n",
- fn, csk, atomic_read(&csk->refcnt.refcount));
+ fn, csk, kref_read(&csk->refcnt));
kref_get(&csk->refcnt);
}
#define cxgbi_sock_get(csk) __cxgbi_sock_get(__func__, csk)
--- a/drivers/scsi/lpfc/lpfc_debugfs.c
+++ b/drivers/scsi/lpfc/lpfc_debugfs.c
@@ -607,7 +607,7 @@ lpfc_debugfs_nodelist_data(struct lpfc_v
len += snprintf(buf+len, size-len, "usgmap:%x ",
ndlp->nlp_usg_map);
len += snprintf(buf+len, size-len, "refcnt:%x",
- atomic_read(&ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
len += snprintf(buf+len, size-len, "\n");
}
spin_unlock_irq(shost->host_lock);
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -3688,7 +3688,7 @@ lpfc_mbx_cmpl_dflt_rpi(struct lpfc_hba *
lpfc_printf_vlog(ndlp->vport, KERN_INFO, LOG_NODE,
"0006 rpi%x DID:%x flg:%x %d map:%x %p\n",
ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
- atomic_read(&ndlp->kref.refcount),
+ kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
if (NLP_CHK_NODE_ACT(ndlp)) {
lpfc_nlp_put(ndlp);
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -3440,7 +3440,7 @@ lpfc_mbx_cmpl_reg_login(struct lpfc_hba
lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
"0002 rpi:%x DID:%x flg:%x %d map:%x %p\n",
ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
- atomic_read(&ndlp->kref.refcount),
+ kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
if (ndlp->nlp_flag & NLP_REG_LOGIN_SEND)
ndlp->nlp_flag &= ~NLP_REG_LOGIN_SEND;
@@ -3861,7 +3861,7 @@ lpfc_mbx_cmpl_ns_reg_login(struct lpfc_h
lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
"0003 rpi:%x DID:%x flg:%x %d map%x %p\n",
ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
- atomic_read(&ndlp->kref.refcount),
+ kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);

if (vport->port_state < LPFC_VPORT_READY) {
@@ -4238,7 +4238,7 @@ lpfc_enable_node(struct lpfc_vport *vpor
"0277 lpfc_enable_node: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
- atomic_read(&ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
return NULL;
}
/* The ndlp should not already be in active mode */
@@ -4248,7 +4248,7 @@ lpfc_enable_node(struct lpfc_vport *vpor
"0278 lpfc_enable_node: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
- atomic_read(&ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
return NULL;
}

@@ -4272,7 +4272,7 @@ lpfc_enable_node(struct lpfc_vport *vpor
"0008 rpi:%x DID:%x flg:%x refcnt:%d "
"map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
ndlp->nlp_flag,
- atomic_read(&ndlp->kref.refcount),
+ kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
}

@@ -4546,7 +4546,7 @@ lpfc_unreg_rpi(struct lpfc_vport *vport,
(bf_get(lpfc_sli_intf_if_type,
&phba->sli4_hba.sli_intf) ==
LPFC_SLI_INTF_IF_TYPE_2) &&
- (atomic_read(&ndlp->kref.refcount) > 0)) {
+ (kref_read(&ndlp->kref) > 0)) {
mbox->context1 = lpfc_nlp_get(ndlp);
mbox->mbox_cmpl =
lpfc_sli4_unreg_rpi_cmpl_clr;
@@ -4695,14 +4695,14 @@ lpfc_cleanup_node(struct lpfc_vport *vpo
"0280 lpfc_cleanup_node: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
- atomic_read(&ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
lpfc_dequeue_node(vport, ndlp);
} else {
lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
"0281 lpfc_cleanup_node: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
- atomic_read(&ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
lpfc_disable_node(vport, ndlp);
}

@@ -4791,7 +4791,7 @@ lpfc_nlp_remove(struct lpfc_vport *vport
lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE,
"0005 rpi:%x DID:%x flg:%x %d map:%x %p\n",
ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
- atomic_read(&ndlp->kref.refcount),
+ kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
if ((mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL))
!= NULL) {
@@ -5557,7 +5557,7 @@ lpfc_mbx_cmpl_fdmi_reg_login(struct lpfc
lpfc_printf_vlog(vport, KERN_INFO, LOG_SLI,
"0004 rpi:%x DID:%x flg:%x %d map:%x %p\n",
ndlp->nlp_rpi, ndlp->nlp_DID, ndlp->nlp_flag,
- atomic_read(&ndlp->kref.refcount),
+ kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);
/*
* Start issuing Fabric-Device Management Interface (FDMI) command to
@@ -5728,7 +5728,7 @@ lpfc_nlp_init(struct lpfc_vport *vport,
"0007 rpi:%x DID:%x flg:%x refcnt:%d "
"map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
ndlp->nlp_flag,
- atomic_read(&ndlp->kref.refcount),
+ kref_read(&ndlp->kref),
ndlp->nlp_usg_map, ndlp);

ndlp->active_rrqs_xri_bitmap =
@@ -5767,7 +5767,7 @@ lpfc_nlp_release(struct kref *kref)
"0279 lpfc_nlp_release: ndlp:x%p did %x "
"usgmap:x%x refcnt:%d rpi:%x\n",
(void *)ndlp, ndlp->nlp_DID, ndlp->nlp_usg_map,
- atomic_read(&ndlp->kref.refcount), ndlp->nlp_rpi);
+ kref_read(&ndlp->kref), ndlp->nlp_rpi);

/* remove ndlp from action. */
lpfc_nlp_remove(ndlp->vport, ndlp);
@@ -5804,7 +5804,7 @@ lpfc_nlp_get(struct lpfc_nodelist *ndlp)
lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
"node get: did:x%x flg:x%x refcnt:x%x",
ndlp->nlp_DID, ndlp->nlp_flag,
- atomic_read(&ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
/* The check of ndlp usage to prevent incrementing the
* ndlp reference count that is in the process of being
* released.
@@ -5817,7 +5817,7 @@ lpfc_nlp_get(struct lpfc_nodelist *ndlp)
"0276 lpfc_nlp_get: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
- atomic_read(&ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
return NULL;
} else
kref_get(&ndlp->kref);
@@ -5844,7 +5844,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
"node put: did:x%x flg:x%x refcnt:x%x",
ndlp->nlp_DID, ndlp->nlp_flag,
- atomic_read(&ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
phba = ndlp->phba;
spin_lock_irqsave(&phba->ndlp_lock, flags);
/* Check the ndlp memory free acknowledge flag to avoid the
@@ -5857,7 +5857,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
"0274 lpfc_nlp_put: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
- atomic_read(&ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
return 1;
}
/* Check the ndlp inactivate log flag to avoid the possible
@@ -5870,7 +5870,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
"0275 lpfc_nlp_put: ndlp:x%p "
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
- atomic_read(&ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
return 1;
}
/* For last put, mark the ndlp usage flags to make sure no
@@ -5878,7 +5878,7 @@ lpfc_nlp_put(struct lpfc_nodelist *ndlp)
* in between the process when the final kref_put has been
* invoked on this ndlp.
*/
- if (atomic_read(&ndlp->kref.refcount) == 1) {
+ if (kref_read(&ndlp->kref) == 1) {
/* Indicate ndlp is put to inactive state. */
NLP_SET_IACT_REQ(ndlp);
/* Acknowledge ndlp memory free has been seen. */
@@ -5906,8 +5906,8 @@ lpfc_nlp_not_used(struct lpfc_nodelist *
lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
"node not used: did:x%x flg:x%x refcnt:x%x",
ndlp->nlp_DID, ndlp->nlp_flag,
- atomic_read(&ndlp->kref.refcount));
- if (atomic_read(&ndlp->kref.refcount) == 1)
+ kref_read(&ndlp->kref));
+ if (kref_read(&ndlp->kref) == 1)
if (lpfc_nlp_put(ndlp))
return 1;
return 0;
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -2660,8 +2660,7 @@ lpfc_cleanup(struct lpfc_vport *vport)
"usgmap:x%x refcnt:%d\n",
ndlp->nlp_DID, (void *)ndlp,
ndlp->nlp_usg_map,
- atomic_read(
- &ndlp->kref.refcount));
+ kref_read(&ndlp->kref));
}
break;
}
--- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c
+++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
@@ -371,7 +371,7 @@ static int tcm_qla2xxx_write_pending(str
*/
pr_debug("write_pending aborted cmd[%p] refcount %d "
"transport_state %x, t_state %x, se_cmd_flags %x\n",
- cmd,cmd->se_cmd.cmd_kref.refcount.counter,
+ cmd, kref_read(&cmd->se_cmd.cmd_kref),
cmd->se_cmd.transport_state,
cmd->se_cmd.t_state,
cmd->se_cmd.se_cmd_flags);
@@ -584,7 +584,7 @@ static int tcm_qla2xxx_queue_data_in(str
*/
pr_debug("queue_data_in aborted cmd[%p] refcount %d "
"transport_state %x, t_state %x, se_cmd_flags %x\n",
- cmd,cmd->se_cmd.cmd_kref.refcount.counter,
+ cmd, kref_read(&cmd->se_cmd.cmd_kref),
cmd->se_cmd.transport_state,
cmd->se_cmd.t_state,
cmd->se_cmd.se_cmd_flags);
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -1300,7 +1300,7 @@ static int ion_debug_heap_show(struct se
seq_printf(s, "%16s %16u %16zu %d %d\n",
buffer->task_comm, buffer->pid,
buffer->size, buffer->kmap_cnt,
- atomic_read(&buffer->ref.refcount));
+ kref_read(&buffer->ref));
total_orphaned_size += buffer->size;
}
}
--- a/drivers/staging/comedi/comedi_buf.c
+++ b/drivers/staging/comedi/comedi_buf.c
@@ -188,7 +188,7 @@ bool comedi_buf_is_mmapped(struct comedi
{
struct comedi_buf_map *bm = s->async->buf_map;

- return bm && (atomic_read(&bm->refcount.refcount) > 1);
+ return bm && (kref_read(&bm->refcount) > 1);
}

int comedi_buf_alloc(struct comedi_device *dev, struct comedi_subdevice *s,
--- a/drivers/target/target_core_pr.c
+++ b/drivers/target/target_core_pr.c
@@ -787,7 +787,7 @@ static struct t10_pr_registration *__cor
* __core_scsi3_add_registration()
*/
dest_lun = rcu_dereference_check(deve_tmp->se_lun,
- atomic_read(&deve_tmp->pr_kref.refcount) != 0);
+ kref_read(&deve_tmp->pr_kref) != 0);

pr_reg_atp = __core_scsi3_do_alloc_registration(dev,
nacl_tmp, dest_lun, deve_tmp,
@@ -1462,7 +1462,7 @@ static int core_scsi3_lunacl_depend_item
* For nacl->dynamic_node_acl=1
*/
lun_acl = rcu_dereference_check(se_deve->se_lun_acl,
- atomic_read(&se_deve->pr_kref.refcount) != 0);
+ kref_read(&se_deve->pr_kref) != 0);
if (!lun_acl)
return 0;

@@ -1477,7 +1477,7 @@ static void core_scsi3_lunacl_undepend_i
* For nacl->dynamic_node_acl=1
*/
lun_acl = rcu_dereference_check(se_deve->se_lun_acl,
- atomic_read(&se_deve->pr_kref.refcount) != 0);
+ kref_read(&se_deve->pr_kref) != 0);
if (!lun_acl) {
kref_put(&se_deve->pr_kref, target_pr_kref_release);
return;
@@ -1758,7 +1758,7 @@ core_scsi3_decode_spec_i_port(
* 2nd loop which will never fail.
*/
dest_lun = rcu_dereference_check(dest_se_deve->se_lun,
- atomic_read(&dest_se_deve->pr_kref.refcount) != 0);
+ kref_read(&dest_se_deve->pr_kref) != 0);

dest_pr_reg = __core_scsi3_alloc_registration(cmd->se_dev,
dest_node_acl, dest_lun, dest_se_deve,
@@ -3465,7 +3465,7 @@ core_scsi3_emulate_pro_register_and_move
iport_ptr);
if (!dest_pr_reg) {
struct se_lun *dest_lun = rcu_dereference_check(dest_se_deve->se_lun,
- atomic_read(&dest_se_deve->pr_kref.refcount) != 0);
+ kref_read(&dest_se_deve->pr_kref) != 0);

spin_unlock(&dev->dev_reservation_lock);
if (core_scsi3_alloc_registration(cmd->se_dev, dest_node_acl,
--- a/drivers/target/tcm_fc/tfc_sess.c
+++ b/drivers/target/tcm_fc/tfc_sess.c
@@ -454,7 +454,7 @@ static void ft_sess_free(struct kref *kr

void ft_sess_put(struct ft_sess *sess)
{
- int sess_held = atomic_read(&sess->kref.refcount);
+ int sess_held = kref_read(&sess->kref);

BUG_ON(!sess_held);
kref_put(&sess->kref, ft_sess_free);
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -3686,7 +3686,7 @@ static void ffs_closed(struct ffs_data *
goto done;

if (opts->no_configfs || !opts->func_inst.group.cg_item.ci_parent
- || !atomic_read(&opts->func_inst.group.cg_item.ci_kref.refcount))
+ || !kref_read(&opts->func_inst.group.cg_item.ci_kref))
goto done;

unregister_gadget_item(ffs_obj->opts->
--- a/fs/exofs/sys.c
+++ b/fs/exofs/sys.c
@@ -122,7 +122,7 @@ void exofs_sysfs_dbg_print(void)
list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
printk(KERN_INFO "%s: name %s ref %d\n",
__func__, kobject_name(k_name),
- (int)atomic_read(&k_name->kref.refcount));
+ (int)kref_read(&k_name->kref));
}
#endif
}
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -349,7 +349,7 @@ static void sc_show_sock_container(struc
" func key: 0x%08x\n"
" func type: %u\n",
sc,
- atomic_read(&sc->sc_kref.refcount),
+ kref_read(&sc->sc_kref),
&saddr, inet ? ntohs(sport) : 0,
&daddr, inet ? ntohs(dport) : 0,
sc->sc_node->nd_name,
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -97,7 +97,7 @@
typeof(sc) __sc = (sc); \
mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \
"pg_off %zu] " fmt, __sc, \
- atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \
+ kref_read(&__sc->sc_kref), __sc->sc_sock, \
__sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \
##args); \
} while (0)
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -81,7 +81,7 @@ static void __dlm_print_lock(struct dlm_
lock->ml.type, lock->ml.convert_type, lock->ml.node,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
- atomic_read(&lock->lock_refs.refcount),
+ kref_read(&lock->lock_refs),
(list_empty(&lock->ast_list) ? 'y' : 'n'),
(lock->ast_pending ? 'y' : 'n'),
(list_empty(&lock->bast_list) ? 'y' : 'n'),
@@ -106,7 +106,7 @@ void __dlm_print_one_lock_resource(struc
printk("lockres: %s, owner=%u, state=%u\n",
buf, res->owner, res->state);
printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
- res->last_used, atomic_read(&res->refs.refcount),
+ res->last_used, kref_read(&res->refs),
list_empty(&res->purge) ? "no" : "yes");
printk(" on dirty list: %s, on reco list: %s, "
"migrating pending: %s\n",
@@ -298,7 +298,7 @@ static int dump_mle(struct dlm_master_li
mle_type, mle->master, mle->new_master,
!list_empty(&mle->hb_events),
!!mle->inuse,
- atomic_read(&mle->mle_refs.refcount));
+ kref_read(&mle->mle_refs));

out += snprintf(buf + out, len - out, "Maybe=");
out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
@@ -494,7 +494,7 @@ static int dump_lock(struct dlm_lock *lo
lock->ast_pending, lock->bast_pending,
lock->convert_pending, lock->lock_pending,
lock->cancel_pending, lock->unlock_pending,
- atomic_read(&lock->lock_refs.refcount));
+ kref_read(&lock->lock_refs));
spin_unlock(&lock->spinlock);

return out;
@@ -521,7 +521,7 @@ static int dump_lockres(struct dlm_lock_
!list_empty(&res->recovering),
res->inflight_locks, res->migration_pending,
atomic_read(&res->asts_reserved),
- atomic_read(&res->refs.refcount));
+ kref_read(&res->refs));

/* refmap */
out += snprintf(buf + out, len - out, "RMAP:");
@@ -777,7 +777,7 @@ static int debug_state_print(struct dlm_
/* Purge Count: xxx Refs: xxx */
out += snprintf(buf + out, len - out,
"Purge Count: %d Refs: %d\n", dlm->purge_count,
- atomic_read(&dlm->dlm_refs.refcount));
+ kref_read(&dlm->dlm_refs));

/* Dead Node: xxx */
out += snprintf(buf + out, len - out,
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -2072,7 +2072,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(c
INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);

mlog(0, "context init: refcount %u\n",
- atomic_read(&dlm->dlm_refs.refcount));
+ kref_read(&dlm->dlm_refs));

leave:
if (ret < 0 && dlm) {
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -233,7 +233,7 @@ static void __dlm_put_mle(struct dlm_mas

assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
- if (!atomic_read(&mle->mle_refs.refcount)) {
+ if (!kref_read(&mle->mle_refs)) {
/* this may or may not crash, but who cares.
* it's a BUG. */
mlog(ML_ERROR, "bad mle: %p\n", mle);
@@ -1124,9 +1124,9 @@ static int dlm_wait_for_lock_mastery(str
unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);

/*
- if (atomic_read(&mle->mle_refs.refcount) < 2)
+ if (kref_read(&mle->mle_refs) < 2)
mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
- atomic_read(&mle->mle_refs.refcount),
+ kref_read(&mle->mle_refs),
res->lockname.len, res->lockname.name);
*/
atomic_set(&mle->woken, 0);
@@ -1988,7 +1988,7 @@ int dlm_assert_master_handler(struct o2n
* on this mle. */
spin_lock(&dlm->master_lock);

- rr = atomic_read(&mle->mle_refs.refcount);
+ rr = kref_read(&mle->mle_refs);
if (mle->inuse > 0) {
if (extra_ref && rr < 3)
err = 1;
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -251,7 +251,7 @@ static enum dlm_status dlmunlock_common(
mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
- atomic_read(&lock->lock_refs.refcount)-1);
+ kref_read(&lock->lock_refs)-1);
dlm_lock_put(lock);
}
if (actions & DLM_UNLOCK_CALL_AST)
--- a/include/drm/drm_framebuffer.h
+++ b/include/drm/drm_framebuffer.h
@@ -247,7 +247,7 @@ static inline void drm_framebuffer_unref
*/
static inline uint32_t drm_framebuffer_read_refcount(struct drm_framebuffer *fb)
{
- return atomic_read(&fb->base.refcount.refcount);
+ return kref_read(&fb->base.refcount);
}

/**
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -864,7 +864,7 @@ static inline int ttm_bo_reserve(struct
{
int ret;

- WARN_ON(!atomic_read(&bo->kref.refcount));
+ WARN_ON(!kref_read(&bo->kref));

ret = __ttm_bo_reserve(bo, interruptible, no_wait, ticket);
if (likely(ret == 0))
@@ -889,7 +889,7 @@ static inline int ttm_bo_reserve_slowpat
{
int ret = 0;

- WARN_ON(!atomic_read(&bo->kref.refcount));
+ WARN_ON(!kref_read(&bo->kref));

if (interruptible)
ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -35,6 +35,11 @@ static inline void kref_init(struct kref
atomic_set(&kref->refcount, 1);
}

+static inline int kref_read(const struct kref *kref)
+{
+ return atomic_read(&kref->refcount);
+}
+
/**
* kref_get - increment refcount for object.
* @kref: object.
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -198,7 +198,7 @@ static inline struct cache_head *cache_

static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
{
- if (atomic_read(&h->ref.refcount) <= 2 &&
+ if (kref_read(&h->ref) <= 2 &&
h->expiry_time < cd->nextcheck)
cd->nextcheck = h->expiry_time;
kref_put(&h->ref, cd->cache_put);
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -987,7 +987,7 @@ static inline void hci_conn_drop(struct
static inline void hci_dev_put(struct hci_dev *d)
{
BT_DBG("%s orig refcnt %d", d->name,
- atomic_read(&d->dev.kobj.kref.refcount));
+ kref_read(&d->dev.kobj.kref));

put_device(&d->dev);
}
@@ -995,7 +995,7 @@ static inline void hci_dev_put(struct hc
static inline struct hci_dev *hci_dev_hold(struct hci_dev *d)
{
BT_DBG("%s orig refcnt %d", d->name,
- atomic_read(&d->dev.kobj.kref.refcount));
+ kref_read(&d->dev.kobj.kref));

get_device(&d->dev);
return d;
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -920,7 +920,7 @@ static void chan_close_cb(struct l2cap_c
BT_DBG("dev %p removing %speer %p", dev,
last ? "last " : "1 ", peer);
BT_DBG("chan %p orig refcnt %d", chan,
- atomic_read(&chan->kref.refcount));
+ kref_read(&chan->kref));

l2cap_chan_put(chan);
break;
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -810,7 +810,7 @@ static struct l2cap_chan *a2mp_chan_open
/* AMP Manager functions */
struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
{
- BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+ BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));

kref_get(&mgr->kref);

@@ -833,7 +833,7 @@ static void amp_mgr_destroy(struct kref

int amp_mgr_put(struct amp_mgr *mgr)
{
- BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+ BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));

return kref_put(&mgr->kref, &amp_mgr_destroy);
}
--- a/net/bluetooth/amp.c
+++ b/net/bluetooth/amp.c
@@ -24,7 +24,7 @@
void amp_ctrl_get(struct amp_ctrl *ctrl)
{
BT_DBG("ctrl %p orig refcnt %d", ctrl,
- atomic_read(&ctrl->kref.refcount));
+ kref_read(&ctrl->kref));

kref_get(&ctrl->kref);
}
@@ -42,7 +42,7 @@ static void amp_ctrl_destroy(struct kref
int amp_ctrl_put(struct amp_ctrl *ctrl)
{
BT_DBG("ctrl %p orig refcnt %d", ctrl,
- atomic_read(&ctrl->kref.refcount));
+ kref_read(&ctrl->kref));

return kref_put(&ctrl->kref, &amp_ctrl_destroy);
}
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -481,14 +481,14 @@ static void l2cap_chan_destroy(struct kr

void l2cap_chan_hold(struct l2cap_chan *c)
{
- BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+ BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));

kref_get(&c->kref);
}

void l2cap_chan_put(struct l2cap_chan *c)
{
- BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+ BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));

kref_put(&c->kref, l2cap_chan_destroy);
}
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3418,7 +3418,7 @@ static void ceph_msg_release(struct kref
struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
{
dout("%s %p (was %d)\n", __func__, msg,
- atomic_read(&msg->kref.refcount));
+ kref_read(&msg->kref));
kref_get(&msg->kref);
return msg;
}
@@ -3427,7 +3427,7 @@ EXPORT_SYMBOL(ceph_msg_get);
void ceph_msg_put(struct ceph_msg *msg)
{
dout("%s %p (was %d)\n", __func__, msg,
- atomic_read(&msg->kref.refcount));
+ kref_read(&msg->kref));
kref_put(&msg->kref, ceph_msg_release);
}
EXPORT_SYMBOL(ceph_msg_put);
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -438,7 +438,7 @@ static void ceph_osdc_release_request(st
void ceph_osdc_get_request(struct ceph_osd_request *req)
{
dout("%s %p (was %d)\n", __func__, req,
- atomic_read(&req->r_kref.refcount));
+ kref_read(&req->r_kref));
kref_get(&req->r_kref);
}
EXPORT_SYMBOL(ceph_osdc_get_request);
@@ -447,7 +447,7 @@ void ceph_osdc_put_request(struct ceph_o
{
if (req) {
dout("%s %p (was %d)\n", __func__, req,
- atomic_read(&req->r_kref.refcount));
+ kref_read(&req->r_kref));
kref_put(&req->r_kref, ceph_osdc_release_request);
}
}
@@ -487,11 +487,11 @@ static void request_reinit(struct ceph_o
struct ceph_msg *reply_msg = req->r_reply;

dout("%s req %p\n", __func__, req);
- WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+ WARN_ON(kref_read(&req->r_kref) != 1);
request_release_checks(req);

- WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
- WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+ WARN_ON(kref_read(&request_msg->kref) != 1);
+ WARN_ON(kref_read(&reply_msg->kref) != 1);
target_destroy(&req->r_t);

request_init(req);
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1358,7 +1358,7 @@ static int c_show(struct seq_file *m, vo
ifdebug(CACHE)
seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
convert_to_wallclock(cp->expiry_time),
- atomic_read(&cp->ref.refcount), cp->flags);
+ kref_read(&cp->ref), cp->flags);
cache_get(cp);
if (cache_check(cd, cp, NULL))
/* cache_check does a cache_put on failure */
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue
svc_xprt_get(xprt);

dprintk("svc: transport %p dequeued, inuse=%d\n",
- xprt, atomic_read(&xprt->xpt_ref.refcount));
+ xprt, kref_read(&xprt->xpt_ref));
}
spin_unlock_bh(&pool->sp_lock);
out:
@@ -820,7 +820,7 @@ static int svc_handle_xprt(struct svc_rq
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
rqstp, rqstp->rq_pool->sp_id, xprt,
- atomic_read(&xprt->xpt_ref.refcount));
+ kref_read(&xprt->xpt_ref));
rqstp->rq_deferred = svc_deferred_dequeue(xprt);
if (rqstp->rq_deferred)
len = svc_deferred_recv(rqstp);
@@ -978,7 +978,7 @@ static void svc_age_temp_xprts(unsigned
* through, close it. */
if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
continue;
- if (atomic_read(&xprt->xpt_ref.refcount) > 1 ||
+ if (kref_read(&xprt->xpt_ref) > 1 ||
test_bit(XPT_BUSY, &xprt->xpt_flags))
continue;
list_del_init(le);
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1222,9 +1222,9 @@ static void __svc_rdma_free(struct work_
ib_drain_qp(rdma->sc_qp);

/* We should only be called from kref_put */
- if (atomic_read(&xprt->xpt_ref.refcount) != 0)
+ if (kref_read(&xprt->xpt_ref) != 0)
pr_err("svcrdma: sc_xprt still in use? (%d)\n",
- atomic_read(&xprt->xpt_ref.refcount));
+ kref_read(&xprt->xpt_ref));

/*
* Destroy queued, but not processed read completions. Note



2016-11-14 18:17:09

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:
> Since we need to change the implementation, stop exposing internals.
>
> Provide kref_read() to read the current reference count; typically
> used for debug messages.

Can we just provide a printk specifier for a kref value instead as
that is the only valid use case for reading the value?

2016-11-15 07:28:49

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Mon, Nov 14, 2016 at 10:16:55AM -0800, Christoph Hellwig wrote:
> On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:
> > Since we need to change the implementation, stop exposing internals.
> >
> > Provide kref_read() to read the current reference count; typically
> > used for debug messages.
>
> Can we just provide a printk specifier for a kref value instead as
> that is the only valid use case for reading the value?

Yeah, that would be great as no one should be doing anything
logic-related based on the kref value.

thanks,

greg k-h

2016-11-15 07:33:16

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:
> Since we need to change the implementation, stop exposing internals.
>
> Provide kref_read() to read the current reference count; typically
> used for debug messages.
>
> Kills two anti-patterns:
>
> atomic_read(&kref->refcount)
> kref->refcount.counter
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> drivers/block/drbd/drbd_req.c | 2 -
> drivers/block/rbd.c | 8 ++---
> drivers/block/virtio_blk.c | 2 -
> drivers/gpu/drm/drm_gem_cma_helper.c | 2 -
> drivers/gpu/drm/drm_info.c | 2 -
> drivers/gpu/drm/drm_mode_object.c | 4 +-
> drivers/gpu/drm/etnaviv/etnaviv_gem.c | 2 -
> drivers/gpu/drm/msm/msm_gem.c | 2 -
> drivers/gpu/drm/nouveau/nouveau_fence.c | 2 -
> drivers/gpu/drm/omapdrm/omap_gem.c | 2 -
> drivers/gpu/drm/ttm/ttm_bo.c | 4 +-
> drivers/gpu/drm/ttm/ttm_object.c | 2 -
> drivers/infiniband/hw/cxgb3/iwch_cm.h | 6 ++--
> drivers/infiniband/hw/cxgb3/iwch_qp.c | 2 -
> drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 6 ++--
> drivers/infiniband/hw/cxgb4/qp.c | 2 -
> drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 6 ++--
> drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 4 +-
> drivers/misc/genwqe/card_dev.c | 2 -
> drivers/misc/mei/debugfs.c | 2 -
> drivers/pci/hotplug/pnv_php.c | 2 -
> drivers/pci/slot.c | 2 -
> drivers/scsi/bnx2fc/bnx2fc_io.c | 8 ++---
> drivers/scsi/cxgbi/libcxgbi.h | 4 +-
> drivers/scsi/lpfc/lpfc_debugfs.c | 2 -
> drivers/scsi/lpfc/lpfc_els.c | 2 -
> drivers/scsi/lpfc/lpfc_hbadisc.c | 40 +++++++++++++--------------
> drivers/scsi/lpfc/lpfc_init.c | 3 --
> drivers/scsi/qla2xxx/tcm_qla2xxx.c | 4 +-
> drivers/staging/android/ion/ion.c | 2 -
> drivers/staging/comedi/comedi_buf.c | 2 -
> drivers/target/target_core_pr.c | 10 +++---
> drivers/target/tcm_fc/tfc_sess.c | 2 -
> drivers/usb/gadget/function/f_fs.c | 2 -
> fs/exofs/sys.c | 2 -
> fs/ocfs2/cluster/netdebug.c | 2 -
> fs/ocfs2/cluster/tcp.c | 2 -
> fs/ocfs2/dlm/dlmdebug.c | 12 ++++----
> fs/ocfs2/dlm/dlmdomain.c | 2 -
> fs/ocfs2/dlm/dlmmaster.c | 8 ++---
> fs/ocfs2/dlm/dlmunlock.c | 2 -
> include/drm/drm_framebuffer.h | 2 -
> include/drm/ttm/ttm_bo_driver.h | 4 +-
> include/linux/kref.h | 5 +++
> include/linux/sunrpc/cache.h | 2 -
> include/net/bluetooth/hci_core.h | 4 +-
> net/bluetooth/6lowpan.c | 2 -
> net/bluetooth/a2mp.c | 4 +-
> net/bluetooth/amp.c | 4 +-
> net/bluetooth/l2cap_core.c | 4 +-
> net/ceph/messenger.c | 4 +-
> net/ceph/osd_client.c | 10 +++---
> net/sunrpc/cache.c | 2 -
> net/sunrpc/svc_xprt.c | 6 ++--
> net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 +-
> 55 files changed, 120 insertions(+), 116 deletions(-)
>
> --- a/drivers/block/drbd/drbd_req.c
> +++ b/drivers/block/drbd/drbd_req.c
> @@ -520,7 +520,7 @@ static void mod_rq_state(struct drbd_req
> /* Completion does it's own kref_put. If we are going to
> * kref_sub below, we need req to be still around then. */
> int at_least = k_put + !!c_put;
> - int refcount = atomic_read(&req->kref.refcount);
> + int refcount = kref_read(&req->kref);
> if (refcount < at_least)
> drbd_err(device,
> "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",

As proof of "things you should never do", here is one such example.

ugh.


> --- a/drivers/block/virtio_blk.c
> +++ b/drivers/block/virtio_blk.c
> @@ -767,7 +767,7 @@ static void virtblk_remove(struct virtio
> /* Stop all the virtqueues. */
> vdev->config->reset(vdev);
>
> - refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
> + refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
> put_disk(vblk->disk);
> vdev->config->del_vqs(vdev);
> kfree(vblk->vqs);

And this too, ugh, that's a huge abuse and is probably totally wrong...

thanks again for digging through this crap. I wonder if we need to name
the kref reference variable "do_not_touch_this_ever" or some such thing
to catch all of the people who try to be "too smart".

greg k-h

2016-11-15 07:47:15

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Tue, Nov 15, 2016 at 08:28:55AM +0100, Greg KH wrote:
> On Mon, Nov 14, 2016 at 10:16:55AM -0800, Christoph Hellwig wrote:
> > On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:
> > > Since we need to change the implementation, stop exposing internals.
> > >
> > > Provide kref_read() to read the current reference count; typically
> > > used for debug messages.
> >
> > Can we just provide a printk specifier for a kref value instead as
> > that is the only valid use case for reading the value?
>
> Yeah, that would be great as no one should be doing anything
> logic-related based on the kref value.

IIRC there are a few users that WARN_ON() the value with a minimum bound
or somesuch. Those would be left in the cold, but yes I too like the
idea of a printk() format specifier.

2016-11-15 08:03:18

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Tue, Nov 15, 2016 at 08:33:22AM +0100, Greg KH wrote:
> On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:

> > --- a/drivers/block/drbd/drbd_req.c
> > +++ b/drivers/block/drbd/drbd_req.c
> > @@ -520,7 +520,7 @@ static void mod_rq_state(struct drbd_req
> > /* Completion does it's own kref_put. If we are going to
> > * kref_sub below, we need req to be still around then. */
> > int at_least = k_put + !!c_put;
> > - int refcount = atomic_read(&req->kref.refcount);
> > + int refcount = kref_read(&req->kref);
> > if (refcount < at_least)
> > drbd_err(device,
> > "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
>
> As proof of "things you should never do", here is one such example.
>
> ugh.
>
>
> > --- a/drivers/block/virtio_blk.c
> > +++ b/drivers/block/virtio_blk.c
> > @@ -767,7 +767,7 @@ static void virtblk_remove(struct virtio
> > /* Stop all the virtqueues. */
> > vdev->config->reset(vdev);
> >
> > - refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
> > + refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
> > put_disk(vblk->disk);
> > vdev->config->del_vqs(vdev);
> > kfree(vblk->vqs);
>
> And this too, ugh, that's a huge abuse and is probably totally wrong...
>
> thanks again for digging through this crap. I wonder if we need to name
> the kref reference variable "do_not_touch_this_ever" or some such thing
> to catch all of the people who try to be "too smart".

There's unimaginable bong hits involved in this stuff, in the end I
resorted to brute force and scripts to convert all this.

2016-11-15 08:37:42

by Ingo Molnar

[permalink] [raw]
Subject: [PATCH] printk, locking/atomics, kref: Introduce new %pAr and %pAk format string options for atomic_t and 'struct kref'


* Greg KH <[email protected]> wrote:

> On Mon, Nov 14, 2016 at 10:16:55AM -0800, Christoph Hellwig wrote:
> > On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:
> > > Since we need to change the implementation, stop exposing internals.
> > >
> > > Provide kref_read() to read the current reference count; typically
> > > used for debug messages.
> >
> > Can we just provide a printk specifier for a kref value instead as
> > that is the only valid use case for reading the value?
>
> Yeah, that would be great as no one should be doing anything
> logic-related based on the kref value.

Find below a patch that implements %pAk for 'struct kref' count printing and
%pAr for atomic_t counter printing.

This is against vanilla upstream.

Thanks,

Ingo

============================>
Subject: printk, locking/atomics, kref: Introduce new %pAr and %pAk format string options for atomic_t and 'struct kref'
From: Ingo Molnar <[email protected]>
Date: Tue Nov 15 08:53:14 CET 2016

A decade of kref internals exposed to driver writers has proven that
exposing internals to them is a bad idea.

Make the bad patterns a bit easier to detect and allow cleaner
printouts by offering two new printk format string extensions:

%pAr - print the atomic_t count in decimal
%pAk - print the struct kref count in decimal

Also add printf testcases:

[ 0.353126] test_printf: all 268 tests passed

Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
Documentation/printk-formats.txt | 10 +++++++++
lib/test_printf.c | 28 ++++++++++++++++++++++++++
lib/vsprintf.c | 42 +++++++++++++++++++++++++++++++++++++++
3 files changed, 80 insertions(+)

Index: tip/Documentation/printk-formats.txt
===================================================================
--- tip.orig/Documentation/printk-formats.txt
+++ tip/Documentation/printk-formats.txt
@@ -316,6 +316,16 @@ Flags bitfields such as page flags, gfp_

Passed by reference.

+atomic variables such atomic_t or struct kref:
+
+ %pAr atomic_t count
+ %pAk struct kref count
+
+ For printing the current count value of atomic variables. This is
+ preferred to accessing the counts directly.
+
+ Passed by reference.
+
Network device features:

%pNF 0x000000000000c000
Index: tip/lib/test_printf.c
===================================================================
--- tip.orig/lib/test_printf.c
+++ tip/lib/test_printf.c
@@ -20,6 +20,8 @@
#include <linux/gfp.h>
#include <linux/mm.h>

+#include <linux/kref.h>
+
#define BUF_SIZE 256
#define PAD_SIZE 16
#define FILL_CHAR '$'
@@ -462,6 +464,31 @@ flags(void)
kfree(cmp_buffer);
}

+/*
+ * Testcases for %pAr (atomic_t) and %pAk (struct kref) count printing:
+ */
+static void __init test_atomics__atomic_t(void)
+{
+ atomic_t count = ATOMIC_INIT(1);
+
+ test("1", "%pAr", &count);
+}
+
+static void __init test_atomics__kref(void)
+{
+ struct kref kref;
+
+ kref_init(&kref);
+
+ test("1", "%pAk", &kref);
+}
+
+static void __init test_atomics(void)
+{
+ test_atomics__atomic_t();
+ test_atomics__kref();
+}
+
static void __init
test_pointer(void)
{
@@ -481,6 +508,7 @@ test_pointer(void)
bitmap();
netdev_features();
flags();
+ test_atomics();
}

static int __init
Index: tip/lib/vsprintf.c
===================================================================
--- tip.orig/lib/vsprintf.c
+++ tip/lib/vsprintf.c
@@ -38,6 +38,8 @@

#include "../mm/internal.h" /* For the trace_print_flags arrays */

+#include <linux/kref.h>
+
#include <asm/page.h> /* for PAGE_SIZE */
#include <asm/sections.h> /* for dereference_function_descriptor() */
#include <asm/byteorder.h> /* cpu_to_le16 */
@@ -1470,6 +1472,40 @@ char *flags_string(char *buf, char *end,
return format_flags(buf, end, flags, names);
}

+static noinline_for_stack
+char *atomic_var(char *buf, char *end, void *atomic_ptr, const char *fmt)
+{
+ unsigned long num;
+ const struct printf_spec numspec = {
+ .flags = SPECIAL|SMALL,
+ .field_width = -1,
+ .precision = -1,
+ .base = 10,
+ };
+
+ switch (fmt[1]) {
+ case 'r':
+ {
+ atomic_t *count_p = (void *)atomic_ptr;
+
+ num = atomic_read(count_p);
+ break;
+ }
+ case 'k':
+ {
+ struct kref *kref_p = (void *)atomic_ptr;
+
+ num = atomic_read(&kref_p->refcount);
+ break;
+ }
+ default:
+ WARN_ONCE(1, "Unsupported atomics modifier: %c\n", fmt[1]);
+ return buf;
+ }
+
+ return number(buf, end, num, numspec);
+}
+
int kptr_restrict __read_mostly;

/*
@@ -1563,6 +1599,10 @@ int kptr_restrict __read_mostly;
* p page flags (see struct page) given as pointer to unsigned long
* g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
* v vma flags (VM_*) given as pointer to unsigned long
+ * - 'A' For the count of atomic variables to be printed.
+ * Supported flags given by option:
+ * r atomic_t ('r'aw count)
+ * k struct kref ('k'ref count)
*
* ** Please update also Documentation/printk-formats.txt when making changes **
*
@@ -1718,6 +1758,8 @@ char *pointer(const char *fmt, char *buf

case 'G':
return flags_string(buf, end, ptr, fmt);
+ case 'A':
+ return atomic_var(buf, end, ptr, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {

2016-11-15 08:44:11

by Ingo Molnar

[permalink] [raw]
Subject: [PATCH v2] printk, locking/atomics, kref: Introduce new %pAr and %pAk format string options for atomic_t and 'struct kref'


* Ingo Molnar <[email protected]> wrote:

>
> * Greg KH <[email protected]> wrote:
>
> > On Mon, Nov 14, 2016 at 10:16:55AM -0800, Christoph Hellwig wrote:
> > > On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:
> > > > Since we need to change the implementation, stop exposing internals.
> > > >
> > > > Provide kref_read() to read the current reference count; typically
> > > > used for debug messages.
> > >
> > > Can we just provide a printk specifier for a kref value instead as
> > > that is the only valid use case for reading the value?
> >
> > Yeah, that would be great as no one should be doing anything
> > logic-related based on the kref value.
>
> Find below a patch that implements %pAk for 'struct kref' count printing and
> %pAr for atomic_t counter printing.
>
> This is against vanilla upstream.

The patch below is against Peter's refcount series. Note that this patch depends
on this patch in Peter's series:

kref: Implement using refcount_t

Thanks,

Ingo

==================================>
Subject: printk, locking/atomics, kref: Introduce new %pAr and %pAk format string options for atomic_t and 'struct kref'
From: Ingo Molnar <[email protected]>
Date: Tue Nov 15 08:53:14 CET 2016

A decade of kref internals exposed to driver writers has proven that
exposing internals to them is a bad idea.

Make the bad patterns a bit easier to detect and allow cleaner
printouts by offering two new printk format string extensions:

%pAr - print the atomic_t count in decimal
%pAk - print the struct kref count in decimal

Also add printf testcases:

[ 0.328495] test_printf: all 268 tests passed

Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
Documentation/printk-formats.txt | 10 +++++++++
lib/test_printf.c | 28 ++++++++++++++++++++++++++
lib/vsprintf.c | 42 +++++++++++++++++++++++++++++++++++++++
3 files changed, 80 insertions(+)

Index: tip/Documentation/printk-formats.txt
===================================================================
--- tip.orig/Documentation/printk-formats.txt
+++ tip/Documentation/printk-formats.txt
@@ -316,6 +316,16 @@ Flags bitfields such as page flags, gfp_

Passed by reference.

+atomic variables such atomic_t or struct kref:
+
+ %pAr atomic_t count
+ %pAk struct kref count
+
+ For printing the current count value of atomic variables. This is
+ preferred to accessing the counts directly.
+
+ Passed by reference.
+
Network device features:

%pNF 0x000000000000c000
Index: tip/lib/test_printf.c
===================================================================
--- tip.orig/lib/test_printf.c
+++ tip/lib/test_printf.c
@@ -20,6 +20,8 @@
#include <linux/gfp.h>
#include <linux/mm.h>

+#include <linux/kref.h>
+
#define BUF_SIZE 256
#define PAD_SIZE 16
#define FILL_CHAR '$'
@@ -462,6 +464,31 @@ flags(void)
kfree(cmp_buffer);
}

+/*
+ * Testcases for %pAr (atomic_t) and %pAk (struct kref) count printing:
+ */
+static void __init test_atomics__atomic_t(void)
+{
+ atomic_t count = ATOMIC_INIT(1);
+
+ test("1", "%pAr", &count);
+}
+
+static void __init test_atomics__kref(void)
+{
+ struct kref kref;
+
+ kref_init(&kref);
+
+ test("1", "%pAk", &kref);
+}
+
+static void __init test_atomics(void)
+{
+ test_atomics__atomic_t();
+ test_atomics__kref();
+}
+
static void __init
test_pointer(void)
{
@@ -481,6 +508,7 @@ test_pointer(void)
bitmap();
netdev_features();
flags();
+ test_atomics();
}

static int __init
Index: tip/lib/vsprintf.c
===================================================================
--- tip.orig/lib/vsprintf.c
+++ tip/lib/vsprintf.c
@@ -38,6 +38,8 @@

#include "../mm/internal.h" /* For the trace_print_flags arrays */

+#include <linux/kref.h>
+
#include <asm/page.h> /* for PAGE_SIZE */
#include <asm/sections.h> /* for dereference_function_descriptor() */
#include <asm/byteorder.h> /* cpu_to_le16 */
@@ -1470,6 +1472,40 @@ char *flags_string(char *buf, char *end,
return format_flags(buf, end, flags, names);
}

+static noinline_for_stack
+char *atomic_var(char *buf, char *end, void *atomic_ptr, const char *fmt)
+{
+ unsigned long num;
+ const struct printf_spec numspec = {
+ .flags = SPECIAL|SMALL,
+ .field_width = -1,
+ .precision = -1,
+ .base = 10,
+ };
+
+ switch (fmt[1]) {
+ case 'r':
+ {
+ atomic_t *count_p = (void *)atomic_ptr;
+
+ num = atomic_read(count_p);
+ break;
+ }
+ case 'k':
+ {
+ struct kref *kref_p = (void *)atomic_ptr;
+
+ num = refcount_read(&kref_p->refcount);
+ break;
+ }
+ default:
+ WARN_ONCE(1, "Unsupported atomics modifier: %c\n", fmt[1]);
+ return buf;
+ }
+
+ return number(buf, end, num, numspec);
+}
+
int kptr_restrict __read_mostly;

/*
@@ -1563,6 +1599,10 @@ int kptr_restrict __read_mostly;
* p page flags (see struct page) given as pointer to unsigned long
* g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
* v vma flags (VM_*) given as pointer to unsigned long
+ * - 'A' For the count of atomic variables to be printed.
+ * Supported flags given by option:
+ * r atomic_t ('r'aw count)
+ * k struct kref ('k'ref count)
*
* ** Please update also Documentation/printk-formats.txt when making changes **
*
@@ -1718,6 +1758,8 @@ char *pointer(const char *fmt, char *buf

case 'G':
return flags_string(buf, end, ptr, fmt);
+ case 'A':
+ return atomic_var(buf, end, ptr, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {

2016-11-15 09:21:40

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] printk, locking/atomics, kref: Introduce new %pAr and %pAk format string options for atomic_t and 'struct kref'

On Tue, Nov 15, 2016 at 09:43:55AM +0100, Ingo Molnar wrote:
> +atomic variables such atomic_t or struct kref:
> +
> + %pAr atomic_t count

Why 'r' for atomic_t ? I was expecting 'a' for atomic_t or something.
That then also leaves 'r' available for refcount_t.

> + %pAk struct kref count

2016-11-15 09:42:02

by Ingo Molnar

[permalink] [raw]
Subject: [PATCH v3] printk, locking/atomics, kref: Introduce new %pAa and %pAk format string options for atomic_t and 'struct kref'


* Peter Zijlstra <[email protected]> wrote:

> On Tue, Nov 15, 2016 at 09:43:55AM +0100, Ingo Molnar wrote:
> > +atomic variables such atomic_t or struct kref:
> > +
> > + %pAr atomic_t count
>
> Why 'r' for atomic_t ? I was expecting 'a' for atomic_t or something.
> That then also leaves 'r' available for refcount_t.

'r' was for 'raw atomic count', but you are right - new patch attached below.

Thanks,

Ingo

==========
Subject: printk, locking/atomics, kref: Introduce new %pAa and %pAk format string options for atomic_t and 'struct kref'
From: Ingo Molnar <[email protected]>
Date: Tue, 15 Nov 2016 09:43:55 +0100

A decade of kref internals exposed to driver writers has proven that
exposing internals to them is a bad idea.

Make the bad patterns a bit easier to detect and allow cleaner
printouts by offering two new printk format string extensions:

%pAa - print the atomic_t count in decimal
%pAk - print the struct kref count in decimal

Also add printf testcases:

[ 0.334919] test_printf: all 268 tests passed

Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
Documentation/printk-formats.txt | 10 +++++++++
lib/test_printf.c | 28 ++++++++++++++++++++++++++
lib/vsprintf.c | 42 +++++++++++++++++++++++++++++++++++++++
3 files changed, 80 insertions(+)

Index: tip/Documentation/printk-formats.txt
===================================================================
--- tip.orig/Documentation/printk-formats.txt
+++ tip/Documentation/printk-formats.txt
@@ -316,6 +316,16 @@ Flags bitfields such as page flags, gfp_

Passed by reference.

+atomic variables such atomic_t or struct kref:
+
+ %pAa atomic_t count
+ %pAk struct kref count
+
+ For printing the current count value of atomic variables. This is
+ preferred to accessing the counts directly.
+
+ Passed by reference.
+
Network device features:

%pNF 0x000000000000c000
Index: tip/lib/test_printf.c
===================================================================
--- tip.orig/lib/test_printf.c
+++ tip/lib/test_printf.c
@@ -20,6 +20,8 @@
#include <linux/gfp.h>
#include <linux/mm.h>

+#include <linux/kref.h>
+
#define BUF_SIZE 256
#define PAD_SIZE 16
#define FILL_CHAR '$'
@@ -462,6 +464,31 @@ flags(void)
kfree(cmp_buffer);
}

+/*
+ * Testcases for %pAa (atomic_t) and %pAk (struct kref) count printing:
+ */
+static void __init test_atomics__atomic_t(void)
+{
+ atomic_t count = ATOMIC_INIT(1);
+
+ test("1", "%pAa", &count);
+}
+
+static void __init test_atomics__kref(void)
+{
+ struct kref kref;
+
+ kref_init(&kref);
+
+ test("1", "%pAk", &kref);
+}
+
+static void __init test_atomics(void)
+{
+ test_atomics__atomic_t();
+ test_atomics__kref();
+}
+
static void __init
test_pointer(void)
{
@@ -481,6 +508,7 @@ test_pointer(void)
bitmap();
netdev_features();
flags();
+ test_atomics();
}

static int __init
Index: tip/lib/vsprintf.c
===================================================================
--- tip.orig/lib/vsprintf.c
+++ tip/lib/vsprintf.c
@@ -38,6 +38,8 @@

#include "../mm/internal.h" /* For the trace_print_flags arrays */

+#include <linux/kref.h>
+
#include <asm/page.h> /* for PAGE_SIZE */
#include <asm/sections.h> /* for dereference_function_descriptor() */
#include <asm/byteorder.h> /* cpu_to_le16 */
@@ -1470,6 +1472,40 @@ char *flags_string(char *buf, char *end,
return format_flags(buf, end, flags, names);
}

+static noinline_for_stack
+char *atomic_var(char *buf, char *end, void *atomic_ptr, const char *fmt)
+{
+ unsigned long num;
+ const struct printf_spec numspec = {
+ .flags = SPECIAL|SMALL,
+ .field_width = -1,
+ .precision = -1,
+ .base = 10,
+ };
+
+ switch (fmt[1]) {
+ case 'a':
+ {
+ atomic_t *count_p = (void *)atomic_ptr;
+
+ num = atomic_read(count_p);
+ break;
+ }
+ case 'k':
+ {
+ struct kref *kref_p = (void *)atomic_ptr;
+
+ num = refcount_read(&kref_p->refcount);
+ break;
+ }
+ default:
+ WARN_ONCE(1, "Unsupported atomics modifier: %c\n", fmt[1]);
+ return buf;
+ }
+
+ return number(buf, end, num, numspec);
+}
+
int kptr_restrict __read_mostly;

/*
@@ -1563,6 +1599,10 @@ int kptr_restrict __read_mostly;
* p page flags (see struct page) given as pointer to unsigned long
* g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
* v vma flags (VM_*) given as pointer to unsigned long
+ * - 'A' For the count of atomic variables to be printed.
+ * Supported flags given by option:
+ * a atomic_t ('a'tomic count)
+ * k struct kref ('k'ref count)
*
* ** Please update also Documentation/printk-formats.txt when making changes **
*
@@ -1718,6 +1758,8 @@ char *pointer(const char *fmt, char *buf

case 'G':
return flags_string(buf, end, ptr, fmt);
+ case 'A':
+ return atomic_var(buf, end, ptr, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {

2016-11-15 10:04:10

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v2] printk, locking/atomics, kref: Introduce new %pAr and %pAk format string options for atomic_t and 'struct kref'

Hi Ingo,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.9-rc5 next-20161115]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url: https://github.com/0day-ci/linux/commits/Ingo-Molnar/printk-locking-atomics-kref-Introduce-new-pAr-and-pAk-format-string-options-for-atomic_t-and-struct-kref/20161115-174900
config: i386-randconfig-x006-201646 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386

All errors (new ones prefixed by >>):

lib/vsprintf.c: In function 'atomic_var':
>> lib/vsprintf.c:1498:10: error: implicit declaration of function 'refcount_read' [-Werror=implicit-function-declaration]
num = refcount_read(&kref_p->refcount);
^~~~~~~~~~~~~
cc1: some warnings being treated as errors

vim +/refcount_read +1498 lib/vsprintf.c

1492 break;
1493 }
1494 case 'k':
1495 {
1496 struct kref *kref_p = (void *)atomic_ptr;
1497
> 1498 num = refcount_read(&kref_p->refcount);
1499 break;
1500 }
1501 default:

---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation


Attachments:
(No filename) (1.29 kB)
.config.gz (27.22 kB)
Download all attachments

2016-11-15 16:42:44

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] printk, locking/atomics, kref: Introduce new %pAr and %pAk format string options for atomic_t and 'struct kref'

On Tue, Nov 15, 2016 at 12:37 AM, Ingo Molnar <[email protected]> wrote:
> +atomic variables such atomic_t or struct kref:
> +
> + %pAr atomic_t count
> + %pAk struct kref count

Not a huge fan. That "r" makes sense to you ("raw" atomic), but it
makes no sense to a user. An atomic isn't "raw" to anybody else. It's
just an atomic.

Also, we have 'atomic64_t", which this doesn't cover at all.

I'd suggest just %pA, %pA64, %pAkref or something. Which leaves us the
choice to add more atomic versions later without having to make up
random one-letter things that make no sense.

Linus

2016-11-15 20:53:42

by Kees Cook

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Tue, Nov 15, 2016 at 12:03 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, Nov 15, 2016 at 08:33:22AM +0100, Greg KH wrote:
>> On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:
>
>> > --- a/drivers/block/drbd/drbd_req.c
>> > +++ b/drivers/block/drbd/drbd_req.c
>> > @@ -520,7 +520,7 @@ static void mod_rq_state(struct drbd_req
>> > /* Completion does it's own kref_put. If we are going to
>> > * kref_sub below, we need req to be still around then. */
>> > int at_least = k_put + !!c_put;
>> > - int refcount = atomic_read(&req->kref.refcount);
>> > + int refcount = kref_read(&req->kref);
>> > if (refcount < at_least)
>> > drbd_err(device,
>> > "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
>>
>> As proof of "things you should never do", here is one such example.
>>
>> ugh.
>>
>>
>> > --- a/drivers/block/virtio_blk.c
>> > +++ b/drivers/block/virtio_blk.c
>> > @@ -767,7 +767,7 @@ static void virtblk_remove(struct virtio
>> > /* Stop all the virtqueues. */
>> > vdev->config->reset(vdev);
>> >
>> > - refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
>> > + refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
>> > put_disk(vblk->disk);
>> > vdev->config->del_vqs(vdev);
>> > kfree(vblk->vqs);
>>
>> And this too, ugh, that's a huge abuse and is probably totally wrong...
>>
>> thanks again for digging through this crap. I wonder if we need to name
>> the kref reference variable "do_not_touch_this_ever" or some such thing
>> to catch all of the people who try to be "too smart".
>
> There's unimaginable bong hits involved in this stuff, in the end I
> resorted to brute force and scripts to convert all this.

What should we do about things like this (bpf_prog_put() and callbacks
from kernel/bpf/syscall.c):


static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
{
struct user_struct *user = prog->aux->user;

atomic_long_sub(prog->pages, &user->locked_vm);
free_uid(user);
}

static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);

free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
bpf_prog_free(aux->prog);
}

void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt))
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}


Not only do we want to protect prog->aux->refcnt, but I think we want
to protect user->locked_vm too ... I don't think it's sane for
user->locked_vm to be a stats_t ?

-Kees

--
Kees Cook
Nexus Security

2016-11-16 08:14:03

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] printk, locking/atomics, kref: Introduce new %pAr and %pAk format string options for atomic_t and 'struct kref'


* Linus Torvalds <[email protected]> wrote:

> On Tue, Nov 15, 2016 at 12:37 AM, Ingo Molnar <[email protected]> wrote:
> > +atomic variables such atomic_t or struct kref:
> > +
> > + %pAr atomic_t count
> > + %pAk struct kref count
>
> Not a huge fan. That "r" makes sense to you ("raw" atomic), but it
> makes no sense to a user. An atomic isn't "raw" to anybody else. It's
> just an atomic.

So in the latestest patch this has evolved to:

%pAa - print the 'atomic_t' count in decimal
%pAk - print the 'struct kref' count in decimal
%pAr - print the 'refcount_t' count in decimal

... are you still hating it?

> Also, we have 'atomic64_t", which this doesn't cover at all.

We could use a somewhat logical letter for atomic64_t too:

%pAA - print the 'atomic64_t' count in decimal

... as 'A' is the bigger version of 'a', just like atomic64_t is the bigger
version of atomic_t! ;-)

> I'd suggest just %pA, %pA64, %pAkref or something. Which leaves us the
> choice to add more atomic versions later without having to make up
> random one-letter things that make no sense.

It's a bit more work, but we could do that too, if you still don't like the above
single letter abbreviations.

Thanks,

Ingo

2016-11-16 08:21:43

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Tue, Nov 15, 2016 at 12:53:35PM -0800, Kees Cook wrote:
> On Tue, Nov 15, 2016 at 12:03 AM, Peter Zijlstra <[email protected]> wrote:
> > On Tue, Nov 15, 2016 at 08:33:22AM +0100, Greg KH wrote:
> >> On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:
> >
> >> > --- a/drivers/block/drbd/drbd_req.c
> >> > +++ b/drivers/block/drbd/drbd_req.c
> >> > @@ -520,7 +520,7 @@ static void mod_rq_state(struct drbd_req
> >> > /* Completion does it's own kref_put. If we are going to
> >> > * kref_sub below, we need req to be still around then. */
> >> > int at_least = k_put + !!c_put;
> >> > - int refcount = atomic_read(&req->kref.refcount);
> >> > + int refcount = kref_read(&req->kref);
> >> > if (refcount < at_least)
> >> > drbd_err(device,
> >> > "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
> >>
> >> As proof of "things you should never do", here is one such example.
> >>
> >> ugh.
> >>
> >>
> >> > --- a/drivers/block/virtio_blk.c
> >> > +++ b/drivers/block/virtio_blk.c
> >> > @@ -767,7 +767,7 @@ static void virtblk_remove(struct virtio
> >> > /* Stop all the virtqueues. */
> >> > vdev->config->reset(vdev);
> >> >
> >> > - refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
> >> > + refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
> >> > put_disk(vblk->disk);
> >> > vdev->config->del_vqs(vdev);
> >> > kfree(vblk->vqs);
> >>
> >> And this too, ugh, that's a huge abuse and is probably totally wrong...
> >>
> >> thanks again for digging through this crap. I wonder if we need to name
> >> the kref reference variable "do_not_touch_this_ever" or some such thing
> >> to catch all of the people who try to be "too smart".
> >
> > There's unimaginable bong hits involved in this stuff, in the end I
> > resorted to brute force and scripts to convert all this.
>
> What should we do about things like this (bpf_prog_put() and callbacks
> from kernel/bpf/syscall.c):
>
>
> static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
> {
> struct user_struct *user = prog->aux->user;
>
> atomic_long_sub(prog->pages, &user->locked_vm);

Oh that's scary. Let's just make one reference count rely on another
one and not check things...

> free_uid(user);
> }
>
> static void __bpf_prog_put_rcu(struct rcu_head *rcu)
> {
> struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
>
> free_used_maps(aux);
> bpf_prog_uncharge_memlock(aux->prog);
> bpf_prog_free(aux->prog);
> }
>
> void bpf_prog_put(struct bpf_prog *prog)
> {
> if (atomic_dec_and_test(&prog->aux->refcnt))
> call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
> }
>
>
> Not only do we want to protect prog->aux->refcnt, but I think we want
> to protect user->locked_vm too ... I don't think it's sane for
> user->locked_vm to be a stats_t ?

I don't think this is sane code...

greg k-h

2016-11-16 10:09:44

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Tue, Nov 15, 2016 at 12:53:35PM -0800, Kees Cook wrote:
>
> What should we do about things like this (bpf_prog_put() and callbacks
> from kernel/bpf/syscall.c):
>
>
> static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
> {
> struct user_struct *user = prog->aux->user;
>
> atomic_long_sub(prog->pages, &user->locked_vm);
> free_uid(user);
> }
>
> static void __bpf_prog_put_rcu(struct rcu_head *rcu)
> {
> struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
>
> free_used_maps(aux);
> bpf_prog_uncharge_memlock(aux->prog);
> bpf_prog_free(aux->prog);
> }
>
> void bpf_prog_put(struct bpf_prog *prog)
> {
> if (atomic_dec_and_test(&prog->aux->refcnt))
> call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
> }
>
>
> Not only do we want to protect prog->aux->refcnt, but I think we want
> to protect user->locked_vm too ... I don't think it's sane for
> user->locked_vm to be a stats_t ?

Why would you want to mess with locked_vm? You seem of the opinion that
everything atomic_t is broken, this isn't the case.

2016-11-16 10:11:01

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Wed, Nov 16, 2016 at 09:21:51AM +0100, Greg KH wrote:
> > What should we do about things like this (bpf_prog_put() and callbacks
> > from kernel/bpf/syscall.c):
> >
> >
> > static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
> > {
> > struct user_struct *user = prog->aux->user;
> >
> > atomic_long_sub(prog->pages, &user->locked_vm);
>
> Oh that's scary. Let's just make one reference count rely on another
> one and not check things...

Its not a reference count, its a resource limit thingy. Also, isn't
stacking, or in general building an object graph, the entire point of
reference counts?

> > free_uid(user);
> > }
> >
> > static void __bpf_prog_put_rcu(struct rcu_head *rcu)
> > {
> > struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
> >
> > free_used_maps(aux);
> > bpf_prog_uncharge_memlock(aux->prog);
> > bpf_prog_free(aux->prog);
> > }
> >
> > void bpf_prog_put(struct bpf_prog *prog)
> > {
> > if (atomic_dec_and_test(&prog->aux->refcnt))
> > call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
> > }
> >
> >
> > Not only do we want to protect prog->aux->refcnt, but I think we want
> > to protect user->locked_vm too ... I don't think it's sane for
> > user->locked_vm to be a stats_t ?
>
> I don't think this is sane code...

I once again fail to see any problems. That code is fine.

2016-11-16 10:12:23

by Daniel Borkmann

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On 11/16/2016 09:21 AM, Greg KH wrote:
> On Tue, Nov 15, 2016 at 12:53:35PM -0800, Kees Cook wrote:
>> On Tue, Nov 15, 2016 at 12:03 AM, Peter Zijlstra <[email protected]> wrote:
>>> On Tue, Nov 15, 2016 at 08:33:22AM +0100, Greg KH wrote:
>>>> On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:
>>>
>>>>> --- a/drivers/block/drbd/drbd_req.c
>>>>> +++ b/drivers/block/drbd/drbd_req.c
>>>>> @@ -520,7 +520,7 @@ static void mod_rq_state(struct drbd_req
>>>>> /* Completion does it's own kref_put. If we are going to
>>>>> * kref_sub below, we need req to be still around then. */
>>>>> int at_least = k_put + !!c_put;
>>>>> - int refcount = atomic_read(&req->kref.refcount);
>>>>> + int refcount = kref_read(&req->kref);
>>>>> if (refcount < at_least)
>>>>> drbd_err(device,
>>>>> "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
>>>>
>>>> As proof of "things you should never do", here is one such example.
>>>>
>>>> ugh.
>>>>
>>>>> --- a/drivers/block/virtio_blk.c
>>>>> +++ b/drivers/block/virtio_blk.c
>>>>> @@ -767,7 +767,7 @@ static void virtblk_remove(struct virtio
>>>>> /* Stop all the virtqueues. */
>>>>> vdev->config->reset(vdev);
>>>>>
>>>>> - refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
>>>>> + refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
>>>>> put_disk(vblk->disk);
>>>>> vdev->config->del_vqs(vdev);
>>>>> kfree(vblk->vqs);
>>>>
>>>> And this too, ugh, that's a huge abuse and is probably totally wrong...
>>>>
>>>> thanks again for digging through this crap. I wonder if we need to name
>>>> the kref reference variable "do_not_touch_this_ever" or some such thing
>>>> to catch all of the people who try to be "too smart".
>>>
>>> There's unimaginable bong hits involved in this stuff, in the end I
>>> resorted to brute force and scripts to convert all this.
>>
>> What should we do about things like this (bpf_prog_put() and callbacks
>> from kernel/bpf/syscall.c):

Just reading up on this series. Your question refers to converting bpf
prog and map ref counts to Peter's refcount_t eventually, right?

>> static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
>> {
>> struct user_struct *user = prog->aux->user;
>>
>> atomic_long_sub(prog->pages, &user->locked_vm);
>
> Oh that's scary. Let's just make one reference count rely on another
> one and not check things...

Sorry, could you elaborate what you mean by 'check things', you mean for
wrap around? IIUC, back then accounting was roughly similar modeled after
perf event's one, and in this case accounts for pages used by progs and
maps during their life-time. Are you suggesting that this approach is
inherently broken?

>> free_uid(user);
>> }
>>
>> static void __bpf_prog_put_rcu(struct rcu_head *rcu)
>> {
>> struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
>>
>> free_used_maps(aux);
>> bpf_prog_uncharge_memlock(aux->prog);
>> bpf_prog_free(aux->prog);
>> }
>>
>> void bpf_prog_put(struct bpf_prog *prog)
>> {
>> if (atomic_dec_and_test(&prog->aux->refcnt))
>> call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>> }
>>
>>
>> Not only do we want to protect prog->aux->refcnt, but I think we want
>> to protect user->locked_vm too ... I don't think it's sane for
>> user->locked_vm to be a stats_t ?
>
> I don't think this is sane code...
>
> greg k-h

2016-11-16 10:18:28

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Wed, Nov 16, 2016 at 11:10:42AM +0100, Peter Zijlstra wrote:
> On Wed, Nov 16, 2016 at 09:21:51AM +0100, Greg KH wrote:
> > > What should we do about things like this (bpf_prog_put() and callbacks
> > > from kernel/bpf/syscall.c):
> > >
> > >
> > > static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
> > > {
> > > struct user_struct *user = prog->aux->user;
> > >
> > > atomic_long_sub(prog->pages, &user->locked_vm);
> >
> > Oh that's scary. Let's just make one reference count rely on another
> > one and not check things...
>
> Its not a reference count, its a resource limit thingy. Also, isn't
> stacking, or in general building an object graph, the entire point of
> reference counts?

Ah, that wasn't obvious, but yes, you are correct here, sorry for the
noise.

greg k-h

2016-11-16 10:19:06

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Wed, Nov 16, 2016 at 11:11:43AM +0100, Daniel Borkmann wrote:
> On 11/16/2016 09:21 AM, Greg KH wrote:
> > On Tue, Nov 15, 2016 at 12:53:35PM -0800, Kees Cook wrote:
> > > On Tue, Nov 15, 2016 at 12:03 AM, Peter Zijlstra <[email protected]> wrote:
> > > > On Tue, Nov 15, 2016 at 08:33:22AM +0100, Greg KH wrote:
> > > > > On Mon, Nov 14, 2016 at 06:39:48PM +0100, Peter Zijlstra wrote:
> > > >
> > > > > > --- a/drivers/block/drbd/drbd_req.c
> > > > > > +++ b/drivers/block/drbd/drbd_req.c
> > > > > > @@ -520,7 +520,7 @@ static void mod_rq_state(struct drbd_req
> > > > > > /* Completion does it's own kref_put. If we are going to
> > > > > > * kref_sub below, we need req to be still around then. */
> > > > > > int at_least = k_put + !!c_put;
> > > > > > - int refcount = atomic_read(&req->kref.refcount);
> > > > > > + int refcount = kref_read(&req->kref);
> > > > > > if (refcount < at_least)
> > > > > > drbd_err(device,
> > > > > > "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
> > > > >
> > > > > As proof of "things you should never do", here is one such example.
> > > > >
> > > > > ugh.
> > > > >
> > > > > > --- a/drivers/block/virtio_blk.c
> > > > > > +++ b/drivers/block/virtio_blk.c
> > > > > > @@ -767,7 +767,7 @@ static void virtblk_remove(struct virtio
> > > > > > /* Stop all the virtqueues. */
> > > > > > vdev->config->reset(vdev);
> > > > > >
> > > > > > - refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
> > > > > > + refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref);
> > > > > > put_disk(vblk->disk);
> > > > > > vdev->config->del_vqs(vdev);
> > > > > > kfree(vblk->vqs);
> > > > >
> > > > > And this too, ugh, that's a huge abuse and is probably totally wrong...
> > > > >
> > > > > thanks again for digging through this crap. I wonder if we need to name
> > > > > the kref reference variable "do_not_touch_this_ever" or some such thing
> > > > > to catch all of the people who try to be "too smart".
> > > >
> > > > There's unimaginable bong hits involved in this stuff, in the end I
> > > > resorted to brute force and scripts to convert all this.
> > >
> > > What should we do about things like this (bpf_prog_put() and callbacks
> > > from kernel/bpf/syscall.c):
>
> Just reading up on this series. Your question refers to converting bpf
> prog and map ref counts to Peter's refcount_t eventually, right?
>
> > > static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
> > > {
> > > struct user_struct *user = prog->aux->user;
> > >
> > > atomic_long_sub(prog->pages, &user->locked_vm);
> >
> > Oh that's scary. Let's just make one reference count rely on another
> > one and not check things...
>
> Sorry, could you elaborate what you mean by 'check things', you mean for
> wrap around? IIUC, back then accounting was roughly similar modeled after
> perf event's one, and in this case accounts for pages used by progs and
> maps during their life-time. Are you suggesting that this approach is
> inherently broken?

No, it is correct, I responded too quickly before my morning coffee had
kicked in, my apologies.

greg k-h

2016-11-16 18:58:42

by Kees Cook

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Wed, Nov 16, 2016 at 2:09 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, Nov 15, 2016 at 12:53:35PM -0800, Kees Cook wrote:
>>
>> What should we do about things like this (bpf_prog_put() and callbacks
>> from kernel/bpf/syscall.c):
>>
>>
>> static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
>> {
>> struct user_struct *user = prog->aux->user;
>>
>> atomic_long_sub(prog->pages, &user->locked_vm);
>> free_uid(user);
>> }
>>
>> static void __bpf_prog_put_rcu(struct rcu_head *rcu)
>> {
>> struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
>>
>> free_used_maps(aux);
>> bpf_prog_uncharge_memlock(aux->prog);
>> bpf_prog_free(aux->prog);
>> }
>>
>> void bpf_prog_put(struct bpf_prog *prog)
>> {
>> if (atomic_dec_and_test(&prog->aux->refcnt))
>> call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>> }
>>
>>
>> Not only do we want to protect prog->aux->refcnt, but I think we want
>> to protect user->locked_vm too ... I don't think it's sane for
>> user->locked_vm to be a stats_t ?
>
> Why would you want to mess with locked_vm? You seem of the opinion that
> everything atomic_t is broken, this isn't the case.

What I mean to say is that while the refcnt here should clearly be
converted to kref or refcount_t, it looks like locked_vm should become
a new stats_t. However, it seems weird for locked_vm to ever wrap
either...

-Kees

--
Kees Cook
Nexus Security

2016-11-16 20:09:15

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Wed, Nov 16, 2016 at 10:58 AM, Kees Cook <[email protected]> wrote:
> On Wed, Nov 16, 2016 at 2:09 AM, Peter Zijlstra <[email protected]> wrote:
>> On Tue, Nov 15, 2016 at 12:53:35PM -0800, Kees Cook wrote:
>>>
>>> What should we do about things like this (bpf_prog_put() and callbacks
>>> from kernel/bpf/syscall.c):
>>>
>>>
>>> static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
>>> {
>>> struct user_struct *user = prog->aux->user;
>>>
>>> atomic_long_sub(prog->pages, &user->locked_vm);
>>> free_uid(user);
>>> }
>>>
>>> static void __bpf_prog_put_rcu(struct rcu_head *rcu)
>>> {
>>> struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
>>>
>>> free_used_maps(aux);
>>> bpf_prog_uncharge_memlock(aux->prog);
>>> bpf_prog_free(aux->prog);
>>> }
>>>
>>> void bpf_prog_put(struct bpf_prog *prog)
>>> {
>>> if (atomic_dec_and_test(&prog->aux->refcnt))
>>> call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>>> }
>>>
>>>
>>> Not only do we want to protect prog->aux->refcnt, but I think we want
>>> to protect user->locked_vm too ... I don't think it's sane for
>>> user->locked_vm to be a stats_t ?
>>
>> Why would you want to mess with locked_vm? You seem of the opinion that
>> everything atomic_t is broken, this isn't the case.
>
> What I mean to say is that while the refcnt here should clearly be
> converted to kref or refcount_t, it looks like locked_vm should become
> a new stats_t. However, it seems weird for locked_vm to ever wrap
> either...

I prefer to avoid 'fixing' things that are not broken.
Note, prog->aux->refcnt already has explicit checks for overflow.
locked_vm is used for resource accounting and not refcnt,
so I don't see issues there either.

2016-11-17 08:35:08

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Wed, Nov 16, 2016 at 10:58:38AM -0800, Kees Cook wrote:
> On Wed, Nov 16, 2016 at 2:09 AM, Peter Zijlstra <[email protected]> wrote:
> > On Tue, Nov 15, 2016 at 12:53:35PM -0800, Kees Cook wrote:
> >>
> >> What should we do about things like this (bpf_prog_put() and callbacks
> >> from kernel/bpf/syscall.c):
> >>
> >>
> >> static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
> >> {
> >> struct user_struct *user = prog->aux->user;
> >>
> >> atomic_long_sub(prog->pages, &user->locked_vm);
> >> free_uid(user);
> >> }
> >>
> >> static void __bpf_prog_put_rcu(struct rcu_head *rcu)
> >> {
> >> struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
> >>
> >> free_used_maps(aux);
> >> bpf_prog_uncharge_memlock(aux->prog);
> >> bpf_prog_free(aux->prog);
> >> }
> >>
> >> void bpf_prog_put(struct bpf_prog *prog)
> >> {
> >> if (atomic_dec_and_test(&prog->aux->refcnt))
> >> call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
> >> }
> >>
> >>
> >> Not only do we want to protect prog->aux->refcnt, but I think we want
> >> to protect user->locked_vm too ... I don't think it's sane for
> >> user->locked_vm to be a stats_t ?
> >
> > Why would you want to mess with locked_vm? You seem of the opinion that
> > everything atomic_t is broken, this isn't the case.
>
> What I mean to say is that while the refcnt here should clearly be
> converted to kref or refcount_t, it looks like locked_vm should become
> a new stats_t. However, it seems weird for locked_vm to ever wrap
> either...

No, its not a statistic. Also, I'm far from convinced stats_t is an
actually useful thing to have.

refcount_t brought special semantics that clearly are different from
regular atomic_t, stats_t would not, so why would it need to exist.

Not to mention that you seem over eager to apply it, which doesn't
inspire confidence.

2016-11-17 08:53:53

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Wed, Nov 16, 2016 at 12:08:52PM -0800, Alexei Starovoitov wrote:

> I prefer to avoid 'fixing' things that are not broken.
> Note, prog->aux->refcnt already has explicit checks for overflow.
> locked_vm is used for resource accounting and not refcnt,
> so I don't see issues there either.

The idea is to use something along the lines of:

http://lkml.kernel.org/r/[email protected]

for all refcounts in the kernel.

Also note that your:

struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
{
if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
atomic_sub(i, &prog->aux->refcnt);
return ERR_PTR(-EBUSY);
}
return prog;
}

is actually broken in the face of an actual overflow. Suppose @i is big
enough to wrap refcnt into negative space.

Also, the current sentiment is to strongly discourage add/sub operations
for refcounts.

2016-11-17 12:45:39

by David Windsor

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Thu, Nov 17, 2016 at 3:34 AM, Peter Zijlstra <[email protected]> wrote:
> On Wed, Nov 16, 2016 at 10:58:38AM -0800, Kees Cook wrote:
>> On Wed, Nov 16, 2016 at 2:09 AM, Peter Zijlstra <[email protected]> wrote:
>> > On Tue, Nov 15, 2016 at 12:53:35PM -0800, Kees Cook wrote:
>> >>
>> >> What should we do about things like this (bpf_prog_put() and callbacks
>> >> from kernel/bpf/syscall.c):
>> >>
>> >>
>> >> static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
>> >> {
>> >> struct user_struct *user = prog->aux->user;
>> >>
>> >> atomic_long_sub(prog->pages, &user->locked_vm);
>> >> free_uid(user);
>> >> }
>> >>
>> >> static void __bpf_prog_put_rcu(struct rcu_head *rcu)
>> >> {
>> >> struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
>> >>
>> >> free_used_maps(aux);
>> >> bpf_prog_uncharge_memlock(aux->prog);
>> >> bpf_prog_free(aux->prog);
>> >> }
>> >>
>> >> void bpf_prog_put(struct bpf_prog *prog)
>> >> {
>> >> if (atomic_dec_and_test(&prog->aux->refcnt))
>> >> call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>> >> }
>> >>
>> >>
>> >> Not only do we want to protect prog->aux->refcnt, but I think we want
>> >> to protect user->locked_vm too ... I don't think it's sane for
>> >> user->locked_vm to be a stats_t ?
>> >
>> > Why would you want to mess with locked_vm? You seem of the opinion that
>> > everything atomic_t is broken, this isn't the case.
>>
>> What I mean to say is that while the refcnt here should clearly be
>> converted to kref or refcount_t, it looks like locked_vm should become
>> a new stats_t. However, it seems weird for locked_vm to ever wrap
>> either...
>
> No, its not a statistic. Also, I'm far from convinced stats_t is an
> actually useful thing to have.
>

Regarding this, has there been any thought given as to how stats_t
will meaningfully differ from atomic_t? If refcount_t is semantically
"atomic_t with reference counter overflow protection," what
services/guarantees does stats_t provide? I cannot think of any that
don't require implementing overflow detection of some sort, which
incurs a performance hit.

One conceivable service/guarantee would be to give stats_t the ability
to detect/report when an overflow has occurred, but not ultimately
with the offending process getting killed. On x86, this could be
done by having stats_t overflows generate a different exception number
and corresponding handler than refcount_t-generated overflows. It
would still contain the mechanisms for detecting and responding to
overflows, but the response to stats_t overflows would differ from
that of refcount_t overflows. Semantically, this version of stats_t
would be "refcount_t minus 'kill the offending process'." I'm not
sure if this abstraction is in fact useful, or indeed worth the
requisite performance hit; I'm just suggesting a possible semantic
difference between atomic_t and stats_t.

> refcount_t brought special semantics that clearly are different from
> regular atomic_t, stats_t would not, so why would it need to exist.
>
> Not to mention that you seem over eager to apply it, which doesn't
> inspire confidence.

2016-11-17 14:30:38

by Elena Reshetova

[permalink] [raw]
Subject: RE: [RFC][PATCH 2/7] kref: Add kref_read()


On Thu, Nov 17, 2016 at 07:30:29AM -0500, David Windsor wrote:
> On Thu, Nov 17, 2016 at 3:34 AM, Peter Zijlstra <[email protected]> wrote:
> > No, its not a statistic. Also, I'm far from convinced stats_t is an
> > actually useful thing to have.
> >
>
> Regarding this, has there been any thought given as to how stats_t
> will meaningfully differ from atomic_t? If refcount_t is semantically
> "atomic_t with reference counter overflow protection," what
> services/guarantees does stats_t provide? I cannot think of any that
> don't require implementing overflow detection of some sort, which
> incurs a performance hit.

>Afaict the whole point of stats_t was to allow overflow, since its only stats, nobody cares etc..

>I think the sole motivator is a general distaste of atomic_t, which isn't a good reason at all.

I don't think anyone has this as motivation. But atomic_t is so powerful and flexible that easily ends up being misused (as past CVEs shown).
Even if we now find all occurrences of atomic_t used as refcounter (which we cannot actually guarantee in any case unless someone manually reads every line)
and convert it to refcount_t, we still have atomic_t type present and new usage of it as refount will crawl in. It is just a matter of time IMO.

So, this approach still doesn't solve the main problem: abuse of atomic_t a refcounter and security vulnerabilities as result.
What other mechanisms can we think we can utilize to prevent it?
- Checkpatch? Would be hard to write enough rules to find all possible patterns how creative people might use atomic as refcounter.
- People reviewing the code? Many kernel vulnerabilities live outside of core kernel, where maintainers are careful about what gets in and what's not. Further you go from core kernel (especially when you reach non-upstream drivers), code review quality is less, possibility of mistake is higher, and on average this code has more vulnerabilities. We can't say "this is not upstream code, who cares", because we want Linux kernel to follow "secure by default" principle: to provide enough mechanisms in kernel itself to minimize risk of mistakes and vulnerabilities. I think atomic is a great example of such case. We need to make it hard for people to make mistakes with overflows when overflows actually matter.
This was really a reason for our initial approach that provided "security by default". Certainly it had some issues (we all agree on this), but let's think how else can we provide "secure by default" protection for this.



2016-11-17 17:05:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Thu, Nov 17, 2016 at 01:01:49PM +0000, Reshetova, Elena wrote:

> >I think the sole motivator is a general distaste of atomic_t, which isn't a good reason at all.
>
> I don't think anyone has this as motivation. But atomic_t is so
> powerful and flexible that easily ends up being misused (as past CVEs
> shown).

I don't think using atomic_t as reference count is abuse. There simply
wasn't anything better. The proposed refcount_t cures this.

> Even if we now find all occurrences of atomic_t used as
> refcounter (which we cannot actually guarantee in any case unless
> someone manually reads every line) and convert it to refcount_t, we
> still have atomic_t type present and new usage of it as refount will
> crawl in. It is just a matter of time IMO.

Improve tooling. The patterns shouldn't be _that_ hard to find. Once the
tools are good, new code isn't a problem either.

Anything: atomic*_{{dec,sub}_and_test,{add,sub}_return,fetch_{add,sub}}
followed by a call_rcu()/free().

2016-11-17 17:07:53

by Elena Reshetova

[permalink] [raw]
Subject: RE: [RFC][PATCH 2/7] kref: Add kref_read()

> Even if we now find all occurrences of atomic_t used as refcounter
> (which we cannot actually guarantee in any case unless someone
> manually reads every line) and convert it to refcount_t, we still have
> atomic_t type present and new usage of it as refount will crawl in. It
> is just a matter of time IMO.

>Improve tooling. The patterns shouldn't be _that_ hard to find. Once the tools are good, new code isn't a problem either.

>Anything: atomic*_{{dec,sub}_and_test,{add,sub}_return,fetch_{add,sub}}
>followed by a call_rcu()/free().

Does not find everything unfortunately. Even if you add to above atomic*_add_unless() and also things like schedule_work(), still I fear we aren't covering everything.
What is worse, I don't think there is a mechanism to guarantee full coverage.



2016-11-17 17:19:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Thu, Nov 17, 2016 at 07:30:29AM -0500, David Windsor wrote:
> On Thu, Nov 17, 2016 at 3:34 AM, Peter Zijlstra <[email protected]> wrote:
> > No, its not a statistic. Also, I'm far from convinced stats_t is an
> > actually useful thing to have.
> >
>
> Regarding this, has there been any thought given as to how stats_t
> will meaningfully differ from atomic_t? If refcount_t is semantically
> "atomic_t with reference counter overflow protection," what
> services/guarantees does stats_t provide? I cannot think of any that
> don't require implementing overflow detection of some sort, which
> incurs a performance hit.

Afaict the whole point of stats_t was to allow overflow, since its only
stats, nobody cares etc..

I think the sole motivator is a general distaste of atomic_t, which
isn't a good reason at all.

2016-11-17 17:19:24

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Thu, 17 Nov 2016, Alexei Starovoitov wrote:
> On Thu, Nov 17, 2016 at 09:53:42AM +0100, Peter Zijlstra wrote:
> > On Wed, Nov 16, 2016 at 12:08:52PM -0800, Alexei Starovoitov wrote:
> >
> > > I prefer to avoid 'fixing' things that are not broken.
> > > Note, prog->aux->refcnt already has explicit checks for overflow.
> > > locked_vm is used for resource accounting and not refcnt,
> > > so I don't see issues there either.
> >
> > The idea is to use something along the lines of:
> >
> > http://lkml.kernel.org/r/[email protected]
> >
> > for all refcounts in the kernel.
>
> I understand the idea. I'm advocating to fix refcnts
> explicitly the way we did in bpf land instead of leaking memory,
> making processes unkillable and so on.
> If refcnt can be bounds checked, it should be done that way, since
> it's a clean error path without odd side effects.
> Therefore I'm against unconditionally applying refcount to all atomics.
>
> > Also note that your:
> >
> > struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
> > {
> > if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
> > atomic_sub(i, &prog->aux->refcnt);
> > return ERR_PTR(-EBUSY);
> > }
> > return prog;
> > }
> >
> > is actually broken in the face of an actual overflow. Suppose @i is big
> > enough to wrap refcnt into negative space.
>
> 'i' is not controlled by user. It's a number of nic hw queues
> and BPF_MAX_REFCNT is 32k, so above is always safe.
>
> > Also, the current sentiment is to strongly discourage add/sub operations
> > for refcounts.
>
> I agree with this reasoning as well, but it's not hard and fast rule.
> If we know we can do 'add' safely, we should.

In principle yes. OTOH, history shows that developers have a pretty bad
judgement what is safe and not. They rather copy code from random places,
modify it in creative ways and be done with it.

Thanks,

tglx

2016-11-17 17:27:50

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Thu, Nov 17, 2016 at 09:53:42AM +0100, Peter Zijlstra wrote:
> On Wed, Nov 16, 2016 at 12:08:52PM -0800, Alexei Starovoitov wrote:
>
> > I prefer to avoid 'fixing' things that are not broken.
> > Note, prog->aux->refcnt already has explicit checks for overflow.
> > locked_vm is used for resource accounting and not refcnt,
> > so I don't see issues there either.
>
> The idea is to use something along the lines of:
>
> http://lkml.kernel.org/r/[email protected]
>
> for all refcounts in the kernel.

I understand the idea. I'm advocating to fix refcnts
explicitly the way we did in bpf land instead of leaking memory,
making processes unkillable and so on.
If refcnt can be bounds checked, it should be done that way, since
it's a clean error path without odd side effects.
Therefore I'm against unconditionally applying refcount to all atomics.

> Also note that your:
>
> struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
> {
> if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
> atomic_sub(i, &prog->aux->refcnt);
> return ERR_PTR(-EBUSY);
> }
> return prog;
> }
>
> is actually broken in the face of an actual overflow. Suppose @i is big
> enough to wrap refcnt into negative space.

'i' is not controlled by user. It's a number of nic hw queues
and BPF_MAX_REFCNT is 32k, so above is always safe.

> Also, the current sentiment is to strongly discourage add/sub operations
> for refcounts.

I agree with this reasoning as well, but it's not hard and fast rule.
If we know we can do 'add' safely, we should.

2016-11-17 18:02:49

by Elena Reshetova

[permalink] [raw]
Subject: RE: [RFC][PATCH 2/7] kref: Add kref_read()


> Even if we now find all occurrences of atomic_t used as refcounter
> (which we cannot actually guarantee in any case unless someone
> manually reads every line) and convert it to refcount_t, we still have
> atomic_t type present and new usage of it as refount will crawl in. It
> is just a matter of time IMO.

>Improve tooling. The patterns shouldn't be _that_ hard to find. Once the tools are good, new code isn't a problem either.

Moreover, thinking of out of tree drivers: you think they would always do checkpatch or run some of our tools for security checks?




2016-11-17 19:10:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Thu, Nov 17, 2016 at 06:02:33PM +0000, Reshetova, Elena wrote:
>
> > Even if we now find all occurrences of atomic_t used as refcounter
> > (which we cannot actually guarantee in any case unless someone
> > manually reads every line) and convert it to refcount_t, we still have
> > atomic_t type present and new usage of it as refount will crawl in. It
> > is just a matter of time IMO.
>
> >Improve tooling. The patterns shouldn't be _that_ hard to find. Once the tools are good, new code isn't a problem either.
>
> Moreover, thinking of out of tree drivers: you think they would always
> do checkpatch or run some of our tools for security checks?

If they can't be arsed, neither can I. You can't fix the unfixable.

Like I said before, its chasing unicorns.

2016-11-17 19:30:11

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Thu, Nov 17, 2016 at 06:02:33PM +0000, Reshetova, Elena wrote:

> >Improve tooling. The patterns shouldn't be _that_ hard to find. Once the tools are good, new code isn't a problem either.
>
> Moreover, thinking of out of tree drivers: you think they would always
> do checkpatch or run some of our tools for security checks?

Also, checkpatch is a horrid example. That's mostly meaningless and
menial noise. Nobody wants to run that, even if, between all the
gibberish it lists a few sensible things.

Make an always enabled GCC plugin that generates build warns with a low
enough false positive rate and nobody will complain.

2016-11-17 19:34:53

by Kees Cook

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Thu, Nov 17, 2016 at 12:34 AM, Peter Zijlstra <[email protected]> wrote:
> On Wed, Nov 16, 2016 at 10:58:38AM -0800, Kees Cook wrote:
>> What I mean to say is that while the refcnt here should clearly be
>> converted to kref or refcount_t, it looks like locked_vm should become
>> a new stats_t. However, it seems weird for locked_vm to ever wrap
>> either...
>
> No, its not a statistic. Also, I'm far from convinced stats_t is an
> actually useful thing to have.

It's useful because its introduction creates a type that can't be
trivially used for refcounting (i.e. hard to make the mistake of using
stats_t for refcounting), and replacing atomic_t statistic counters
with stats_t reduces the effort required to do the initial (and
on-going) audit for misuse of atomic_t as a refcounter.

> refcount_t brought special semantics that clearly are different from
> regular atomic_t, stats_t would not, so why would it need to exist.

Your original suggestion about stats_t showed how its accessor API
would be a very small subset of the regular atomic_t set. I think that
reduction in accidental misuse has value.

> Not to mention that you seem over eager to apply it, which doesn't
> inspire confidence.

I'd like to get to the point where auditing for mistakes in this area
is tractable. :) If atomic_t is only used for non-stats and
non-refcount, it's much much easier to examine and reason about.

-Kees

--
Kees Cook
Nexus Security

2016-11-18 17:34:02

by Elena Reshetova

[permalink] [raw]
Subject: RE: [RFC][PATCH 2/7] kref: Add kref_read()

On Thu, Nov 17, 2016 at 09:53:42AM +0100, Peter Zijlstra wrote:
> On Wed, Nov 16, 2016 at 12:08:52PM -0800, Alexei Starovoitov wrote:
>
> > I prefer to avoid 'fixing' things that are not broken.
> > Note, prog->aux->refcnt already has explicit checks for overflow.
> > locked_vm is used for resource accounting and not refcnt, so I don't
> > see issues there either.
>
> The idea is to use something along the lines of:
>
>
> http://lkml.kernel.org/r/[email protected]
> -ass.net
>
> for all refcounts in the kernel.

>I understand the idea. I'm advocating to fix refcnts explicitly the way we did in bpf land instead of leaking memory, making processes unkillable and so on.
>If refcnt can be bounds checked, it should be done that way, since it's a clean error path without odd side effects.
>Therefore I'm against unconditionally applying refcount to all atomics.

> Also note that your:
>
> struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) {
> if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
> atomic_sub(i, &prog->aux->refcnt);
> return ERR_PTR(-EBUSY);
> }
> return prog;
> }
>
> is actually broken in the face of an actual overflow. Suppose @i is
> big enough to wrap refcnt into negative space.

>'i' is not controlled by user. It's a number of nic hw queues and BPF_MAX_REFCNT is 32k, so above is always safe.

If I understand your code right, you export the bpf_prog_add() and anyone is free to use it
(some crazy buggy driver for example).
Currently only drivers/net/ethernet/mellanox/mlx4/en_netdev.c uses it, but you should
consider any externally exposed interface as an attack vector from security point of view.
So, I would not claim that above construction is always safe since there is a way using API to
supply "i" that would overflow.

Next question is how to convert the above code sanely to refcount_t interface... Loop of inc(s)? Iikk...



2016-11-19 03:47:36

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Fri, Nov 18, 2016 at 05:33:35PM +0000, Reshetova, Elena wrote:
> On Thu, Nov 17, 2016 at 09:53:42AM +0100, Peter Zijlstra wrote:
> > On Wed, Nov 16, 2016 at 12:08:52PM -0800, Alexei Starovoitov wrote:
> >
> > > I prefer to avoid 'fixing' things that are not broken.
> > > Note, prog->aux->refcnt already has explicit checks for overflow.
> > > locked_vm is used for resource accounting and not refcnt, so I don't
> > > see issues there either.
> >
> > The idea is to use something along the lines of:
> >
> >
> > http://lkml.kernel.org/r/[email protected]
> > -ass.net
> >
> > for all refcounts in the kernel.
>
> >I understand the idea. I'm advocating to fix refcnts explicitly the way we did in bpf land instead of leaking memory, making processes unkillable and so on.
> >If refcnt can be bounds checked, it should be done that way, since it's a clean error path without odd side effects.
> >Therefore I'm against unconditionally applying refcount to all atomics.
>
> > Also note that your:
> >
> > struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) {
> > if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
> > atomic_sub(i, &prog->aux->refcnt);
> > return ERR_PTR(-EBUSY);
> > }
> > return prog;
> > }
> >
> > is actually broken in the face of an actual overflow. Suppose @i is
> > big enough to wrap refcnt into negative space.
>
> >'i' is not controlled by user. It's a number of nic hw queues and BPF_MAX_REFCNT is 32k, so above is always safe.
>
> If I understand your code right, you export the bpf_prog_add() and anyone is free to use it
> (some crazy buggy driver for example).
> Currently only drivers/net/ethernet/mellanox/mlx4/en_netdev.c uses it, but you should
> consider any externally exposed interface as an attack vector from security point of view.

It's not realistic to harden all export_symbol apis.
Code review for in-tree modules is the only defense we have.
Remember out of tree perf counter issues... nothing perf core can do
about that. If it's out of tree, it's vendor's problem to fix it.

2016-11-21 08:18:16

by Elena Reshetova

[permalink] [raw]
Subject: RE: [RFC][PATCH 2/7] kref: Add kref_read()

> On Fri, Nov 18, 2016 at 05:33:35PM +0000, Reshetova, Elena wrote:
> > On Thu, Nov 17, 2016 at 09:53:42AM +0100, Peter Zijlstra wrote:
> > > On Wed, Nov 16, 2016 at 12:08:52PM -0800, Alexei Starovoitov wrote:
> > >
> > > > I prefer to avoid 'fixing' things that are not broken.
> > > > Note, prog->aux->refcnt already has explicit checks for overflow.
> > > > locked_vm is used for resource accounting and not refcnt, so I don't
> > > > see issues there either.
> > >
> > > The idea is to use something along the lines of:
> > >
> > >
> > >
> http://lkml.kernel.org/r/[email protected]
> > > -ass.net
> > >
> > > for all refcounts in the kernel.
> >
> > >I understand the idea. I'm advocating to fix refcnts explicitly the way we did
> in bpf land instead of leaking memory, making processes unkillable and so on.
> > >If refcnt can be bounds checked, it should be done that way, since it's a
> clean error path without odd side effects.
> > >Therefore I'm against unconditionally applying refcount to all atomics.
> >
> > > Also note that your:
> > >
> > > struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) {
> > > if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
> > > atomic_sub(i, &prog->aux->refcnt);
> > > return ERR_PTR(-EBUSY);
> > > }
> > > return prog;
> > > }
> > >
> > > is actually broken in the face of an actual overflow. Suppose @i is
> > > big enough to wrap refcnt into negative space.
> >
> > >'i' is not controlled by user. It's a number of nic hw queues and
> BPF_MAX_REFCNT is 32k, so above is always safe.
> >
> > If I understand your code right, you export the bpf_prog_add() and anyone is
> free to use it
> > (some crazy buggy driver for example).
> > Currently only drivers/net/ethernet/mellanox/mlx4/en_netdev.c uses it, but
> you should
> > consider any externally exposed interface as an attack vector from security
> point of view.
>
> It's not realistic to harden all export_symbol apis.
> Code review for in-tree modules is the only defense we have.
> Remember out of tree perf counter issues... nothing perf core can do
> about that. If it's out of tree, it's vendor's problem to fix it.

I am not trying to harden them all now, but since we are going through the list of
atomic_t variables that are used for refcounting, and this seems to be the one,
I was trying to find a way to convert it also since it isn't a big effort and would do
good at the end.
So, you are not fine if I convert the above code using only refcount_inc and refcount_dec_and_test
primitives?




2016-11-21 12:47:15

by David Windsor

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Fri, Nov 18, 2016 at 12:33 PM, Reshetova, Elena
<[email protected]> wrote:
> On Thu, Nov 17, 2016 at 09:53:42AM +0100, Peter Zijlstra wrote:
>> On Wed, Nov 16, 2016 at 12:08:52PM -0800, Alexei Starovoitov wrote:
>>
>> > I prefer to avoid 'fixing' things that are not broken.
>> > Note, prog->aux->refcnt already has explicit checks for overflow.
>> > locked_vm is used for resource accounting and not refcnt, so I don't
>> > see issues there either.
>>
>> The idea is to use something along the lines of:
>>
>>
>> http://lkml.kernel.org/r/[email protected]
>> -ass.net
>>
>> for all refcounts in the kernel.
>
>>I understand the idea. I'm advocating to fix refcnts explicitly the way we did in bpf land instead of leaking memory, making processes unkillable and so on.
>>If refcnt can be bounds checked, it should be done that way, since it's a clean error path without odd side effects.
>>Therefore I'm against unconditionally applying refcount to all atomics.
>
>> Also note that your:
>>
>> struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) {
>> if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
>> atomic_sub(i, &prog->aux->refcnt);
>> return ERR_PTR(-EBUSY);
>> }
>> return prog;
>> }
>>
>> is actually broken in the face of an actual overflow. Suppose @i is
>> big enough to wrap refcnt into negative space.
>
>>'i' is not controlled by user. It's a number of nic hw queues and BPF_MAX_REFCNT is 32k, so above is always safe.
>
> If I understand your code right, you export the bpf_prog_add() and anyone is free to use it
> (some crazy buggy driver for example).
> Currently only drivers/net/ethernet/mellanox/mlx4/en_netdev.c uses it, but you should
> consider any externally exposed interface as an attack vector from security point of view.
> So, I would not claim that above construction is always safe since there is a way using API to
> supply "i" that would overflow.
>
> Next question is how to convert the above code sanely to refcount_t interface... Loop of inc(s)? Iikk...
>

By the way, there are several sites where the use of
atomic_t/atomic_wrap_t as a counter ventures beyond the standard (inc,
dec, add, sub, read, set) operations we're planning on implementing
for both refcount_t and stats_t. While performing the conversion to
stats_t, I've found usage of atomic_xchg(), for instance. From
kernel/trace/trace_mmiotrace.c:123:

unsigned long cnt = atomic_xchg(&dropped_count, 0);

stats_xchg() isn't anticipated to go into the stats_t API, and
dropped_count clearly appears to be a statistical counter, so we will
have to further audit this site to determine whether the atomicity of
the atomic_xchg() operation is truly necessary here. If it is, we
can either decide to implement stats_xchg(), or we could use a
combination of locking, stats_read() and stats_set() to accomplish the
same thing as stats_xchg().

>
>

2016-11-21 15:39:24

by Elena Reshetova

[permalink] [raw]
Subject: RE: [RFC][PATCH 2/7] kref: Add kref_read()

> By the way, there are several sites where the use of
> atomic_t/atomic_wrap_t as a counter ventures beyond the standard (inc,
> dec, add, sub, read, set) operations we're planning on implementing
> for both refcount_t and stats_t.

Speaking of non-fitting patterns. This one is quite common in networking code for refcounters:

if (atomic_cmpxchg(&cur->refcnt, 1, 0) == 1) {}
This is from net/netfilter/nfnetlink_acct.c, but there are similar ones in other places.

Also, simple atomic_dec() is used pretty much everywhere for counters, which we don’t have a straight match in refcount_t API.


2016-11-21 15:49:35

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Mon, Nov 21, 2016 at 03:39:19PM +0000, Reshetova, Elena wrote:
> > By the way, there are several sites where the use of
> > atomic_t/atomic_wrap_t as a counter ventures beyond the standard (inc,
> > dec, add, sub, read, set) operations we're planning on implementing
> > for both refcount_t and stats_t.
>
> Speaking of non-fitting patterns. This one is quite common in
> networking code for refcounters:
>
> if (atomic_cmpxchg(&cur->refcnt, 1, 0) == 1) {} This is from
> net/netfilter/nfnetlink_acct.c, but there are similar ones in other
> places.

Cute, but weird it doesn't actually decrement if not 1.

> Also, simple atomic_dec() is used pretty much everywhere for counters,
> which we don’t have a straight match in refcount_t API.

WARN_ON(refcount_dec_and_test(refs));

And seeing how I've implememented refcount_inc() in similar terms:

WARN_ON(!refcount_inc_not_zero(refs));

It might make sense to actually provide that.

2016-11-21 16:01:16

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Mon, Nov 21, 2016 at 04:49:15PM +0100, Peter Zijlstra wrote:
> > Speaking of non-fitting patterns. This one is quite common in
> > networking code for refcounters:
> >
> > if (atomic_cmpxchg(&cur->refcnt, 1, 0) == 1) {} This is from
> > net/netfilter/nfnetlink_acct.c, but there are similar ones in other
> > places.
>
> Cute, but weird it doesn't actually decrement if not 1.

Hurgh.. creative refcounting that. The question is how much of that do
we want to support? It really must not decrement there.

2016-11-21 19:27:45

by Elena Reshetova

[permalink] [raw]
Subject: RE: [RFC][PATCH 2/7] kref: Add kref_read()

> On Mon, Nov 21, 2016 at 04:49:15PM +0100, Peter Zijlstra wrote:
> > > Speaking of non-fitting patterns. This one is quite common in
> > > networking code for refcounters:
> > >
> > > if (atomic_cmpxchg(&cur->refcnt, 1, 0) == 1) {} This is from
> > > net/netfilter/nfnetlink_acct.c, but there are similar ones in other
> > > places.
> >
> > Cute, but weird it doesn't actually decrement if not 1.
>
> Hurgh.. creative refcounting that. The question is how much of that do
> we want to support? It really must not decrement there.

And one more creative usage:

http://lxr.free-electrons.com/source/net/ipv4/udp.c#L1940

if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2))
return;

I didn't even guess anyone is using atomic_inc_not_zero_hint...
But network code keeps surprising me today :)
So, yes, I guess the question is what to do with these cases really?

2016-11-21 20:12:36

by David Windsor

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Mon, Nov 21, 2016 at 2:27 PM, Reshetova, Elena
<[email protected]> wrote:
>> On Mon, Nov 21, 2016 at 04:49:15PM +0100, Peter Zijlstra wrote:
>> > > Speaking of non-fitting patterns. This one is quite common in
>> > > networking code for refcounters:
>> > >
>> > > if (atomic_cmpxchg(&cur->refcnt, 1, 0) == 1) {} This is from
>> > > net/netfilter/nfnetlink_acct.c, but there are similar ones in other
>> > > places.
>> >
>> > Cute, but weird it doesn't actually decrement if not 1.
>>
>> Hurgh.. creative refcounting that. The question is how much of that do
>> we want to support? It really must not decrement there.
>
> And one more creative usage:
>
> http://lxr.free-electrons.com/source/net/ipv4/udp.c#L1940
>
> if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2))
> return;
>
> I didn't even guess anyone is using atomic_inc_not_zero_hint...
> But network code keeps surprising me today :)
> So, yes, I guess the question is what to do with these cases really?

Many of the calls to non-supported functions can be decomposed into
calls to supported functions. The ones that may prove interesting are
ones like atomic_cmpxchg(), in which some sort of external locking is
going to be required to achieve the same atomicity guarantees provided
by cmpxchg, like so:

mutex_lock(lock);
cnt = refcount_read(ref);
if (cnt == val1) {
refcount_set(ref, val2);
}
mutex_unlock(lock);
return cnt;

2016-11-22 10:37:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/7] kref: Add kref_read()

On Mon, Nov 21, 2016 at 03:12:33PM -0500, David Windsor wrote:
> On Mon, Nov 21, 2016 at 2:27 PM, Reshetova, Elena
> <[email protected]> wrote:
> >> On Mon, Nov 21, 2016 at 04:49:15PM +0100, Peter Zijlstra wrote:
> >> > > Speaking of non-fitting patterns. This one is quite common in
> >> > > networking code for refcounters:
> >> > >
> >> > > if (atomic_cmpxchg(&cur->refcnt, 1, 0) == 1) {} This is from
> >> > > net/netfilter/nfnetlink_acct.c, but there are similar ones in other
> >> > > places.
> >> >
> >> > Cute, but weird it doesn't actually decrement if not 1.
> >>
> >> Hurgh.. creative refcounting that. The question is how much of that do
> >> we want to support? It really must not decrement there.

Now, arguably the 1->0 case is special, and we can provide limited
support for that, but I'd be hesitant to provide the full cmpxchg.

We could for instance provide: refcount_dec_if_one().

> > And one more creative usage:
> >
> > http://lxr.free-electrons.com/source/net/ipv4/udp.c#L1940
> >
> > if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2))
> > return;
> >
> > I didn't even guess anyone is using atomic_inc_not_zero_hint...
> > But network code keeps surprising me today :)
> > So, yes, I guess the question is what to do with these cases really?
>
> Many of the calls to non-supported functions can be decomposed into
> calls to supported functions.

So it really depends on what the network guys are willing to put up
with, if their primary goal is to avoid the SHARED state, we could add a
load-exclusive. But I suspect they'd not be happy with that either...

> The ones that may prove interesting are
> ones like atomic_cmpxchg(), in which some sort of external locking is
> going to be required to achieve the same atomicity guarantees provided
> by cmpxchg, like so:
>
> mutex_lock(lock);
> cnt = refcount_read(ref);
> if (cnt == val1) {
> refcount_set(ref, val2);
> }
> mutex_unlock(lock);
> return cnt;

That cannot actually work in the presence of actual atomic instructions
not serialized by that lock.

Also, the network guys will absolutely kill you if you propose something
like that.