Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753158AbaBTTaJ (ORCPT ); Thu, 20 Feb 2014 14:30:09 -0500 Received: from mail.linuxfoundation.org ([140.211.169.12]:41828 "EHLO mail.linuxfoundation.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752354AbaBTTaC (ORCPT ); Thu, 20 Feb 2014 14:30:02 -0500 Date: Thu, 20 Feb 2014 11:31:28 -0800 From: Greg KH To: linux-kernel@vger.kernel.org, Andrew Morton , torvalds@linux-foundation.org, stable@vger.kernel.org Cc: lwn@lwn.net, Jiri Slaby Subject: Re: Linux 3.4.81 Message-ID: <20140220193128.GB9107@kroah.com> References: <20140220193119.GA9107@kroah.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20140220193119.GA9107@kroah.com> User-Agent: Mutt/1.5.22 (2013-10-16) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org diff --git a/Makefile b/Makefile index 7b6c9ec4922b..5e1e1d6e0736 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 4 -SUBLEVEL = 80 +SUBLEVEL = 81 EXTRAVERSION = NAME = Saber-toothed Squirrel diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 0d39f2f4294a..e51fdc72cdc0 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -21,8 +21,6 @@ struct workqueue_struct *virtblk_wq; struct virtio_blk { - spinlock_t lock; - struct virtio_device *vdev; struct virtqueue *vq; @@ -69,7 +67,7 @@ static void blk_done(struct virtqueue *vq) unsigned int len; unsigned long flags; - spin_lock_irqsave(&vblk->lock, flags); + spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { int error; @@ -104,7 +102,7 @@ static void blk_done(struct virtqueue *vq) } /* In case queue is stopped waiting for more buffers. 
*/ blk_start_queue(vblk->disk->queue); - spin_unlock_irqrestore(&vblk->lock, flags); + spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); } static bool do_req(struct request_queue *q, struct virtio_blk *vblk, @@ -438,7 +436,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) } INIT_LIST_HEAD(&vblk->reqs); - spin_lock_init(&vblk->lock); vblk->vdev = vdev; vblk->sg_elems = sg_elems; sg_init_table(vblk->sg, vblk->sg_elems); @@ -463,7 +460,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) goto out_mempool; } - q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); + q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL); if (!q) { err = -ENOMEM; goto out_put_disk; diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index ba60f3c8f911..38c0a4720cc8 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -1934,6 +1934,27 @@ ips_ping_for_i915_load(void) } } +static void i915_kick_out_firmware_fb(struct drm_i915_private *dev_priv) +{ + struct apertures_struct *ap; + struct pci_dev *pdev = dev_priv->dev->pdev; + bool primary; + + ap = alloc_apertures(1); + if (!ap) + return; + + ap->ranges[0].base = dev_priv->dev->agp->base; + ap->ranges[0].size = + dev_priv->mm.gtt->gtt_mappable_entries << PAGE_SHIFT; + primary = + pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW; + + remove_conflicting_framebuffers(ap, "inteldrmfb", primary); + + kfree(ap); +} + /** * i915_driver_load - setup chip and create an initial config * @dev: DRM device @@ -1971,6 +1992,15 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) goto free_priv; } + dev_priv->mm.gtt = intel_gtt_get(); + if (!dev_priv->mm.gtt) { + DRM_ERROR("Failed to initialize GTT\n"); + ret = -ENODEV; + goto put_bridge; + } + + i915_kick_out_firmware_fb(dev_priv); + pci_set_master(dev->pdev); /* overlay on gen2 is broken and can't address above 1G */ @@ -1996,13 +2026,6 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) goto put_bridge; } - dev_priv->mm.gtt = intel_gtt_get(); - if (!dev_priv->mm.gtt) { - DRM_ERROR("Failed to initialize GTT\n"); - ret = -ENODEV; - goto out_rmmap; - } - agp_size = dev_priv->mm.gtt->gtt_mappable_entries << PAGE_SHIFT; dev_priv->mm.gtt_mapping = diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index 82442085cbe6..573b4601d5b9 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -284,8 +284,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, int j; int ret; - ret = get_user_pages(current, current->mm, addr, - npages, 0, 1, pages, NULL); + ret = get_user_pages_fast(addr, npages, 0, pages); if (ret != npages) { int i; @@ -830,10 +829,7 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd, while (dim) { const int mxp = 8; - down_write(¤t->mm->mmap_sem); ret = qib_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); - up_write(¤t->mm->mmap_sem); - if (ret <= 0) goto done_unlock; else { diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index a4b14a41cbf4..e2c2e1e2bd6f 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -40,11 +40,28 @@ * Note that newer firmware allows querying device for maximum useable * coordinates. 
*/ +#define XMIN 0 +#define XMAX 6143 +#define YMIN 0 +#define YMAX 6143 #define XMIN_NOMINAL 1472 #define XMAX_NOMINAL 5472 #define YMIN_NOMINAL 1408 #define YMAX_NOMINAL 4448 +/* Size in bits of absolute position values reported by the hardware */ +#define ABS_POS_BITS 13 + +/* + * Any position values from the hardware above the following limits are + * treated as "wrapped around negative" values that have been truncated to + * the 13-bit reporting range of the hardware. These are just reasonable + * guesses and can be adjusted if hardware is found that operates outside + * of these parameters. + */ +#define X_MAX_POSITIVE (((1 << ABS_POS_BITS) + XMAX) / 2) +#define Y_MAX_POSITIVE (((1 << ABS_POS_BITS) + YMAX) / 2) + /* * Synaptics touchpads report the y coordinate from bottom to top, which is * opposite from what userspace expects. @@ -555,6 +572,12 @@ static int synaptics_parse_hw_state(const unsigned char buf[], hw->right = (buf[0] & 0x02) ? 1 : 0; } + /* Convert wrap-around values to negative */ + if (hw->x > X_MAX_POSITIVE) + hw->x -= 1 << ABS_POS_BITS; + if (hw->y > Y_MAX_POSITIVE) + hw->y -= 1 << ABS_POS_BITS; + return 0; } diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 10f122a3a856..da4dc255bc54 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -185,8 +185,12 @@ config MD_FAULTY In unsure, say N. +config BLK_DEV_DM_BUILTIN + boolean + config BLK_DEV_DM tristate "Device mapper support" + select BLK_DEV_DM_BUILTIN ---help--- Device-mapper is a low level volume manager. It works by allowing people to specify mappings for ranges of logical sectors. Various diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 8b2e0dffe82e..4e87c544744d 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o +obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c new file mode 100644 index 000000000000..797daec490ed --- /dev/null +++ b/drivers/md/dm-builtin.c @@ -0,0 +1,50 @@ +#include "dm.h" + +#include + +/* + * The kobject release method must not be placed in the module itself, + * otherwise we are subject to module unload races. + * + * The release method is called when the last reference to the kobject is + * dropped. It may be called by any other kernel code that drops the last + * reference. + * + * The release method suffers from module unload race. We may prevent the + * module from being unloaded at the start of the release method (using + * increased module reference count or synchronizing against the release + * method), however there is no way to prevent the module from being + * unloaded at the end of the release method. + * + * If this code were placed in the dm module, the following race may + * happen: + * 1. Some other process takes a reference to dm kobject + * 2. The user issues ioctl function to unload the dm device + * 3. dm_sysfs_exit calls kobject_put, however the object is not released + * because of the other reference taken at step 1 + * 4. dm_sysfs_exit waits on the completion + * 5. The other process that took the reference in step 1 drops it, + * dm_kobject_release is called from this process + * 6. dm_kobject_release calls complete() + * 7. a reschedule happens before dm_kobject_release returns + * 8. 
dm_sysfs_exit continues, the dm device is unloaded, module reference + * count is decremented + * 9. The user unloads the dm module + * 10. The other process that was rescheduled in step 7 continues to run, + * it is now executing code in unloaded module, so it crashes + * + * Note that if the process that takes the foreign reference to dm kobject + * has a low priority and the system is sufficiently loaded with + * higher-priority processes that prevent the low-priority process from + * being scheduled long enough, this bug may really happen. + * + * In order to fix this module unload race, we place the release method + * into a helper code that is compiled directly into the kernel. + */ + +void dm_kobject_release(struct kobject *kobj) +{ + complete(dm_get_completion_from_kobject(kobj)); +} + +EXPORT_SYMBOL(dm_kobject_release); diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index e0cc5d6a9e46..c62c5ab6aed5 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -79,11 +79,6 @@ static const struct sysfs_ops dm_sysfs_ops = { .show = dm_attr_show, }; -static void dm_kobject_release(struct kobject *kobj) -{ - complete(dm_get_completion_from_kobject(kobj)); -} - /* * dm kobject is embedded in mapped_device structure * no need to define release function here diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d26fddf7c1fb..0cf8c519d07e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -191,11 +191,8 @@ struct mapped_device { /* forced geometry settings */ struct hd_geometry geometry; - /* sysfs handle */ - struct kobject kobj; - - /* wait until the kobject is released */ - struct completion kobj_completion; + /* kobject and completion */ + struct dm_kobject_holder kobj_holder; /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; @@ -1894,7 +1891,7 @@ static struct mapped_device *alloc_dev(int minor) init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); - init_completion(&md->kobj_completion); + init_completion(&md->kobj_holder.completion); md->disk->major = _major; md->disk->first_minor = minor; @@ -2686,20 +2683,14 @@ struct gendisk *dm_disk(struct mapped_device *md) struct kobject *dm_kobject(struct mapped_device *md) { - return &md->kobj; + return &md->kobj_holder.kobj; } -/* - * struct mapped_device should not be exported outside of dm.c - * so use this check to verify that kobj is part of md structure - */ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) { struct mapped_device *md; - md = container_of(kobj, struct mapped_device, kobj); - if (&md->kobj != kobj) - return NULL; + md = container_of(kobj, struct mapped_device, kobj_holder.kobj); if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) @@ -2709,13 +2700,6 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) return md; } -struct completion *dm_get_completion_from_kobject(struct kobject *kobj) -{ - struct mapped_device *md = container_of(kobj, struct mapped_device, kobj); - - return &md->kobj_completion; -} - int dm_suspended_md(struct mapped_device *md) { return test_bit(DMF_SUSPENDED, &md->flags); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1174e9654882..9db80c92096a 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -16,6 +16,7 @@ #include #include #include +#include /* * Suspend feature flags @@ -120,11 +121,25 @@ void dm_interface_exit(void); /* * sysfs interface */ +struct dm_kobject_holder { + struct kobject kobj; + struct completion completion; +}; + +static inline 
struct completion *dm_get_completion_from_kobject(struct kobject *kobj) +{ + return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; +} + int dm_sysfs_init(struct mapped_device *md); void dm_sysfs_exit(struct mapped_device *md); struct kobject *dm_kobject(struct mapped_device *md); struct mapped_device *dm_get_from_kobject(struct kobject *kobj); -struct completion *dm_get_completion_from_kobject(struct kobject *kobj); + +/* + * The kobject helper + */ +void dm_kobject_release(struct kobject *kobj); /* * Targets for linear and striped mappings diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index f286955331a2..a251efd783de 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -133,21 +133,24 @@ static struct se_device *fd_create_virtdevice( ret = PTR_ERR(dev_p); goto fail; } -#if 0 - if (di->no_create_file) - flags = O_RDWR | O_LARGEFILE; - else - flags = O_RDWR | O_CREAT | O_LARGEFILE; -#else - flags = O_RDWR | O_CREAT | O_LARGEFILE; -#endif -/* flags |= O_DIRECT; */ /* - * If fd_buffered_io=1 has not been set explicitly (the default), - * use O_SYNC to force FILEIO writes to disk. + * Use O_DSYNC by default instead of O_SYNC to forgo syncing + * of pure timestamp updates. + */ + flags = O_RDWR | O_CREAT | O_LARGEFILE | O_DSYNC; + /* + * Optionally allow fd_buffered_io=1 to be enabled for people + * who want use the fs buffer cache as an WriteCache mechanism. + * + * This means that in event of a hard failure, there is a risk + * of silent data-loss if the SCSI client has *not* performed a + * forced unit access (FUA) write, or issued SYNCHRONIZE_CACHE + * to write-out the entire device cache. */ - if (!(fd_dev->fbd_flags & FDBD_USE_BUFFERED_IO)) - flags |= O_SYNC; + if (fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE) { + pr_debug("FILEIO: Disabling O_DSYNC, using buffered FILEIO\n"); + flags &= ~O_DSYNC; + } file = filp_open(dev_p, flags, 0600); if (IS_ERR(file)) { @@ -215,6 +218,12 @@ static struct se_device *fd_create_virtdevice( if (!dev) goto fail; + if (fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE) { + pr_debug("FILEIO: Forcing setting of emulate_write_cache=1" + " with FDBD_HAS_BUFFERED_IO_WCE\n"); + dev->se_sub_dev->se_dev_attrib.emulate_write_cache = 1; + } + fd_dev->fd_dev_id = fd_host->fd_host_dev_id_count++; fd_dev->fd_queue_depth = dev->queue_depth; @@ -399,26 +408,6 @@ static void fd_emulate_sync_cache(struct se_task *task) transport_complete_sync_cache(cmd, ret == 0); } -/* - * WRITE Force Unit Access (FUA) emulation on a per struct se_task - * LBA range basis.. - */ -static void fd_emulate_write_fua(struct se_cmd *cmd, struct se_task *task) -{ - struct se_device *dev = cmd->se_dev; - struct fd_dev *fd_dev = dev->dev_ptr; - loff_t start = task->task_lba * dev->se_sub_dev->se_dev_attrib.block_size; - loff_t end = start + task->task_size; - int ret; - - pr_debug("FILEIO: FUA WRITE LBA: %llu, bytes: %u\n", - task->task_lba, task->task_size); - - ret = vfs_fsync_range(fd_dev->fd_file, start, end, 1); - if (ret != 0) - pr_err("FILEIO: vfs_fsync_range() failed: %d\n", ret); -} - static int fd_do_task(struct se_task *task) { struct se_cmd *cmd = task->task_se_cmd; @@ -433,19 +422,21 @@ static int fd_do_task(struct se_task *task) ret = fd_do_readv(task); } else { ret = fd_do_writev(task); - + /* + * Perform implict vfs_fsync_range() for fd_do_writev() ops + * for SCSI WRITEs with Forced Unit Access (FUA) set. + * Allow this to happen independent of WCE=0 setting. 
+ */ if (ret > 0 && - dev->se_sub_dev->se_dev_attrib.emulate_write_cache > 0 && dev->se_sub_dev->se_dev_attrib.emulate_fua_write > 0 && (cmd->se_cmd_flags & SCF_FUA)) { - /* - * We might need to be a bit smarter here - * and return some sense data to let the initiator - * know the FUA WRITE cache sync failed..? - */ - fd_emulate_write_fua(cmd, task); - } + struct fd_dev *fd_dev = dev->dev_ptr; + loff_t start = task->task_lba * + dev->se_sub_dev->se_dev_attrib.block_size; + loff_t end = start + task->task_size; + vfs_fsync_range(fd_dev->fd_file, start, end, 1); + } } if (ret < 0) { @@ -544,7 +535,7 @@ static ssize_t fd_set_configfs_dev_params( pr_debug("FILEIO: Using buffered I/O" " operations for struct fd_dev\n"); - fd_dev->fbd_flags |= FDBD_USE_BUFFERED_IO; + fd_dev->fbd_flags |= FDBD_HAS_BUFFERED_IO_WCE; break; default: break; @@ -579,8 +570,8 @@ static ssize_t fd_show_configfs_dev_params( bl = sprintf(b + bl, "TCM FILEIO ID: %u", fd_dev->fd_dev_id); bl += sprintf(b + bl, " File: %s Size: %llu Mode: %s\n", fd_dev->fd_dev_name, fd_dev->fd_dev_size, - (fd_dev->fbd_flags & FDBD_USE_BUFFERED_IO) ? - "Buffered" : "Synchronous"); + (fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE) ? + "Buffered-WCE" : "O_DSYNC"); return bl; } diff --git a/drivers/target/target_core_file.h b/drivers/target/target_core_file.h index 59e6e73106c2..6b1b6a979a10 100644 --- a/drivers/target/target_core_file.h +++ b/drivers/target/target_core_file.h @@ -18,7 +18,7 @@ struct fd_request { #define FBDF_HAS_PATH 0x01 #define FBDF_HAS_SIZE 0x02 -#define FDBD_USE_BUFFERED_IO 0x04 +#define FDBD_HAS_BUFFERED_IO_WCE 0x04 struct fd_dev { u32 fbd_flags; diff --git a/fs/buffer.c b/fs/buffer.c index 18669e92b676..9bf31ac982e1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -613,14 +613,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { - spin_lock_irq(&mapping->tree_lock); + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? 
*/ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b1c3f2ade2b0..75c4f36bced8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -778,6 +778,8 @@ got: ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - ino)); up_read(&grp->alloc_sem); + } else { + ext4_lock_group(sb, group); } ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (S_ISDIR(mode)) { @@ -790,8 +792,8 @@ got: } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); - ext4_unlock_group(sb, group); } + ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c07462320f6b..da8fd94fcd93 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1751,12 +1751,12 @@ int __init nfs_init_writepagecache(void) nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, nfs_wdata_cachep); if (nfs_wdata_mempool == NULL) - return -ENOMEM; + goto out_destroy_write_cache; nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, nfs_wdata_cachep); if (nfs_commit_mempool == NULL) - return -ENOMEM; + goto out_destroy_write_mempool; /* * NFS congestion size, scale with available memory. @@ -1779,6 +1779,12 @@ int __init nfs_init_writepagecache(void) nfs_congestion_kb = 256*1024; return 0; + +out_destroy_write_mempool: + mempool_destroy(nfs_wdata_mempool); +out_destroy_write_cache: + kmem_cache_destroy(nfs_wdata_cachep); + return -ENOMEM; } void nfs_destroy_writepagecache(void) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c2470de8052..8cd5cb80223c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -144,6 +144,7 @@ extern unsigned long this_cpu_load(void); extern void calc_global_load(unsigned long ticks); +extern void update_cpu_load_nohz(void); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index eef311a58a64..11e22c068e8b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -6,7 +6,7 @@ * * Copyright (C) 1998,2001-2005 Pavel Machek * Copyright (C) 2006 Rafael J. Wysocki - * Copyright (C) 2010 Bojan Smojver + * Copyright (C) 2010-2012 Bojan Smojver * * This file is released under the GPLv2. 
* @@ -282,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) return -ENOSPC; if (bio_chain) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); if (src) { copy_page(src, buf); } else { ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + src = (void *)__get_free_page(__GFP_WAIT | + __GFP_NOWARN | + __GFP_NORETRY); if (src) { copy_page(src, buf); } else { @@ -367,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; - } - if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { - error = hib_wait_on_bio_chain(bio_chain); - if (error) - goto out; - handle->reqd_free_pages = reqd_free_pages(); + + if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_on_bio_chain(bio_chain); + if (error) + goto out; + /* + * Recalculate the number of required free pages, to + * make sure we never take more than half. + */ + handle->reqd_free_pages = reqd_free_pages(); + } } out: return error; @@ -419,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, /* Maximum number of threads for compression/decompression. */ #define LZO_THREADS 3 -/* Maximum number of pages for read buffering. */ -#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) +/* Minimum/maximum number of pages for read buffering. */ +#define LZO_MIN_RD_PAGES 1024 +#define LZO_MAX_RD_PAGES 8192 /** @@ -631,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle, } /* - * Adjust number of free pages after all allocations have been done. - * We don't want to run out of pages when writing. - */ - handle->reqd_free_pages = reqd_free_pages(); - - /* * Start the CRC32 thread. */ init_waitqueue_head(&crc->go); @@ -657,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle, goto out_clean; } + /* + * Adjust the number of required free pages after all allocations have + * been done. We don't want to run out of pages when writing. + */ + handle->reqd_free_pages = reqd_free_pages(); + printk(KERN_INFO "PM: Using %u thread(s) for compression.\n" "PM: Compressing and saving image data (%u pages) ... ", @@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle, unsigned i, thr, run_threads, nr_threads; unsigned ring = 0, pg = 0, ring_size = 0, have = 0, want, need, asked = 0; - unsigned long read_pages; + unsigned long read_pages = 0; unsigned char **page = NULL; struct dec_data *data = NULL; struct crc_data *crc = NULL; @@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = vmalloc(sizeof(*page) * LZO_READ_PAGES); + page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle, } /* - * Adjust number of pages for read buffering, in case we are short. + * Set the number of pages for read buffering. + * This is complete guesswork, because we'll only know the real + * picture once prepare_image() is called, which is much later on + * during the image load phase. We'll assume the worst case and + * say that none of the image pages are from high memory. 
*/ - read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; - read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); + if (low_free_pages() > snapshot_get_image_size()) + read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; + read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT); + __GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); + if (!page[i]) { if (i < LZO_CMP_PAGES) { ring_size = i; diff --git a/kernel/printk.c b/kernel/printk.c index e95c66223d33..e131c464f332 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1172,7 +1172,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, switch (action) { case CPU_ONLINE: case CPU_DEAD: - case CPU_DYING: case CPU_DOWN_FAILED: case CPU_UP_CANCELED: console_lock(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7522816cd7f6..94f132775d05 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data) } #endif -void update_cpu_load(struct rq *this_rq); - static void set_load_weight(struct task_struct *p) { int prio = p->static_prio - MAX_RT_PRIO; @@ -2620,22 +2618,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ -void update_cpu_load(struct rq *this_rq) +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) { - unsigned long this_load = this_rq->load.weight; - unsigned long curr_jiffies = jiffies; - unsigned long pending_updates; int i, scale; this_rq->nr_load_updates++; - /* Avoid repeated calls on same jiffy, when moving in and out of idle */ - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - /* Update our load: */ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { @@ -2660,9 +2649,78 @@ void update_cpu_load(struct rq *this_rq) sched_avg_update(this_rq); } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = this_rq->load.weight; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. 
+ */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ static void update_cpu_load_active(struct rq *this_rq) { - update_cpu_load(this_rq); + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, this_rq->load.weight, 1); calc_load_account_active(this_rq); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 93be350c9b63..dd33c9fd8009 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5042,7 +5042,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) raw_spin_lock_irq(&this_rq->lock); update_rq_clock(this_rq); - update_cpu_load(this_rq); + update_idle_cpu_load(this_rq); raw_spin_unlock_irq(&this_rq->lock); rebalance_domains(balance_cpu, CPU_IDLE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a70d8908a6d3..4a5e7398d77b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -873,7 +873,7 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); -extern void update_cpu_load(struct rq *this_rq); +extern void update_idle_cpu_load(struct rq *this_rq); #ifdef CONFIG_CGROUP_CPUACCT #include diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a1e079536a71..638dadf6295f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -582,6 +582,7 @@ void tick_nohz_idle_exit(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); + update_cpu_load_nohz(); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f0e76e93aee5..5efdddf04b15 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -222,6 +222,29 @@ static void update_global_ops(void) global_ops.func = func; } +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. 
+ */ +} + +static void ftrace_sync_ipi(void *data) +{ + /* Probably not needed, but do it anyway */ + smp_rmb(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void update_function_graph_func(void); +#else +static inline void update_function_graph_func(void) { } +#endif + static void update_ftrace_function(void) { ftrace_func_t func; @@ -240,6 +263,8 @@ static void update_ftrace_function(void) else func = ftrace_ops_list_func; + update_function_graph_func(); + #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; #else @@ -359,16 +384,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { ret = remove_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); - if (!ret) { - /* - * The ftrace_ops is now removed from the list, - * so there'll be no new users. We must ensure - * all current users are done before we free - * the control data. - */ - synchronize_sched(); - control_ops_free(ops); - } } else ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -378,13 +393,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) update_ftrace_function(); - /* - * Dynamic ops may be freed, we must make sure that all - * callers are done before leaving this function. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - synchronize_sched(); - return 0; } @@ -2008,10 +2016,41 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) command |= FTRACE_UPDATE_TRACE_FUNC; } - if (!command || !ftrace_enabled) + if (!command || !ftrace_enabled) { + /* + * If these are control ops, they still need their + * per_cpu field freed. Since, function tracing is + * not currently active, we can just free them + * without synchronizing all CPUs. + */ + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); return 0; + } ftrace_run_update_code(command); + + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + * The same goes for freeing the per_cpu data of the control + * ops. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + schedule_on_each_cpu(ftrace_sync); + + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); + } + return 0; } @@ -4404,6 +4443,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) trace_func_graph_ret_t ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; +static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -4544,6 +4584,30 @@ static struct ftrace_ops fgraph_ops __read_mostly = { .flags = FTRACE_OPS_FL_GLOBAL, }; +static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) +{ + if (!ftrace_ops_test(&global_ops, trace->func)) + return 0; + return __ftrace_graph_entry(trace); +} + +/* + * The function graph tracer should only trace the functions defined + * by set_ftrace_filter and set_ftrace_notrace. 
If another function + * tracer ops is registered, the graph tracer requires testing the + * function against the global ops, and not just trace any function + * that any ftrace_ops registered. + */ +static void update_function_graph_func(void) +{ + if (ftrace_ops_list == &ftrace_list_end || + (ftrace_ops_list == &global_ops && + global_ops.next == &ftrace_list_end)) + ftrace_graph_entry = __ftrace_graph_entry; + else + ftrace_graph_entry = ftrace_graph_entry_test; +} + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -4568,7 +4632,16 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, } ftrace_graph_return = retfunc; - ftrace_graph_entry = entryfunc; + + /* + * Update the indirect function to the entryfunc, and the + * function that gets called to the entry_test first. Then + * call the update fgraph entry function to determine if + * the entryfunc should be called directly or not. + */ + __ftrace_graph_entry = entryfunc; + ftrace_graph_entry = ftrace_graph_entry_test; + update_function_graph_func(); ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); @@ -4587,6 +4660,7 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; + __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/lib/Makefile b/lib/Makefile index 18515f0267c4..801c567b9e4b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o +GCOV_PROFILE_hweight.o := n CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 73e2c45acfdb..cd0aab7e6aa7 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -926,7 +926,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, * %pK cannot be used in IRQ context because its test * for CAP_SYSLOG would be meaningless. */ - if (in_irq() || in_serving_softirq() || in_nmi()) { + if (kptr_restrict && (in_irq() || in_serving_softirq() || + in_nmi())) { if (spec.field_width == -1) spec.field_width = 2 * sizeof(void *); return string(buf, end, "pK-error", spec); diff --git a/mm/internal.h b/mm/internal.h index 2189af491783..0c26b5e6d41d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -309,3 +309,5 @@ extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; + +extern void set_pageblock_order(void); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3b15e2a147a2..2a13b7997ac6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1993,11 +1993,12 @@ int __set_page_dirty_nobuffers(struct page *page) if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; + unsigned long flags; if (!mapping) return 1; - spin_lock_irq(&mapping->tree_lock); + spin_lock_irqsave(&mapping->tree_lock, flags); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? 
*/ BUG_ON(mapping2 != mapping); @@ -2006,7 +2007,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 508822e1082a..39d530a425b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4254,25 +4254,24 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -/* Return a sensible default order for the pageblock size. */ -static inline int pageblock_default_order(void) -{ - if (HPAGE_SHIFT > PAGE_SHIFT) - return HUGETLB_PAGE_ORDER; - - return MAX_ORDER-1; -} - /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -static inline void __init set_pageblock_order(unsigned int order) +void __init set_pageblock_order(void) { + unsigned int order; + /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; + if (HPAGE_SHIFT > PAGE_SHIFT) + order = HUGETLB_PAGE_ORDER; + else + order = MAX_ORDER - 1; + /* * Assume the largest contiguous order of interest is a huge page. - * This value may be variable depending on boot parameters on IA64 + * This value may be variable depending on boot parameters on IA64 and + * powerpc. */ pageblock_order = order; } @@ -4280,15 +4279,13 @@ static inline void __init set_pageblock_order(unsigned int order) /* * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() - * and pageblock_default_order() are unused as pageblock_order is set - * at compile-time. See include/linux/pageblock-flags.h for the values of - * pageblock_order based on the kernel config + * is unused as pageblock_order is set at compile-time. See + * include/linux/pageblock-flags.h for the values of pageblock_order based on + * the kernel config */ -static inline int pageblock_default_order(unsigned int order) +void __init set_pageblock_order(void) { - return MAX_ORDER-1; } -#define set_pageblock_order(x) do {} while (0) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ @@ -4376,7 +4373,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, if (!size) continue; - set_pageblock_order(pageblock_default_order()); + set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); diff --git a/mm/sparse.c b/mm/sparse.c index 290dba25a7ed..42935b5545a0 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -486,6 +486,9 @@ void __init sparse_init(void) struct page **map_map; #endif + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ + set_pageblock_order(); + /* * map is using big page (aka 2M in x86 64 bit) * usemap is less one page (aka 24 bytes) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 185f849a26f6..72b20b1089df 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -1229,6 +1229,10 @@ static int security_context_to_sid_core(const char *scontext, u32 scontext_len, struct context context; int rc = 0; + /* An empty security context is never valid. 
*/ + if (!scontext_len) + return -EINVAL; + if (!ss_initialized) { int i; diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 9f614b4e365f..272407c00ede 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -318,6 +318,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, */ hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link) if (ei->type == KVM_IRQ_ROUTING_MSI || + ue->type == KVM_IRQ_ROUTING_MSI || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return r; --
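
A note on the drivers/input/mouse/synaptics.c change above: the hardware reports 13-bit absolute coordinates, so a small negative value arrives as a large positive one. The hunk treats anything above X_MAX_POSITIVE/Y_MAX_POSITIVE as wrapped and subtracts 1 << ABS_POS_BITS. A minimal standalone sketch of that conversion (illustration only, not part of the patch; the constants mirror ABS_POS_BITS and XMAX from the diff):

#include <stdio.h>

/* Mirrors the constants introduced by the synaptics.c hunk. */
#define ABS_POS_BITS	13
#define XMAX		6143
#define X_MAX_POSITIVE	(((1 << ABS_POS_BITS) + XMAX) / 2)

/* Convert a raw 13-bit coordinate to a signed value, as the patch does. */
static int unwrap_x(int raw)
{
	if (raw > X_MAX_POSITIVE)
		raw -= 1 << ABS_POS_BITS;
	return raw;
}

int main(void)
{
	/* 8191 is -1 truncated to 13 bits, so it should come back as -1. */
	printf("%d -> %d\n", 8191, unwrap_x(8191));
	/* In-range values are left alone. */
	printf("%d -> %d\n", 3000, unwrap_x(3000));
	return 0;
}

With X_MAX_POSITIVE evaluating to 7167 here, a raw 8191 is recovered as -1, while ordinary coordinates pass through unchanged.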
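
Similarly, the fs/nfs/write.c hunk replaces two early "return -ENOMEM" statements with a goto-based unwind, so a failure to create the commit mempool no longer leaks the cache and mempool created before it. A rough userspace sketch of the same unwind structure (the cache/pool helpers below are hypothetical stand-ins for kmem_cache_create() and mempool_create_slab_pool(), not real kernel calls):

#include <stdlib.h>

struct cache { int unused; };
struct pool { int unused; };

static struct cache *cache_create(void) { return malloc(sizeof(struct cache)); }
static struct pool *pool_create(void) { return malloc(sizeof(struct pool)); }
static void cache_destroy(struct cache *c) { free(c); }
static void pool_destroy(struct pool *p) { free(p); }

static struct cache *wdata_cache;
static struct pool *wdata_pool;
static struct pool *commit_pool;

/* Allocate three resources; on failure, free whatever was set up earlier. */
static int init_writepagecache(void)
{
	wdata_cache = cache_create();
	if (!wdata_cache)
		return -1;

	wdata_pool = pool_create();
	if (!wdata_pool)
		goto out_destroy_write_cache;

	commit_pool = pool_create();
	if (!commit_pool)
		goto out_destroy_write_mempool;

	return 0;

out_destroy_write_mempool:
	pool_destroy(wdata_pool);
out_destroy_write_cache:
	cache_destroy(wdata_cache);
	return -1;
}

int main(void)
{
	return init_writepagecache() ? 1 : 0;
}

Each later failure jumps to a label that tears down, in reverse order, everything allocated before it; the label names match those added by the patch.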