2005-03-02 15:53:17

by David Howells

[permalink] [raw]
Subject: Improving mmap() support for !MMU further


Hi Linus,

I'd like to improve mmap() support on !MMU still further by overloading struct
backing_dev_info::memory_backed to hold flags describing what the backing
device is capable of with respect to direct memory access.

(*) If bdi->memory_backed is 0, then the backing device is not accessible as
memory.

(*) If the bdi->memory_backed is non-zero, then it's a bit mask of the
following:

#define BDI_NO_WRITEBACK 0x00000001
#define BDI_MEMORY_BACKED 0x00000001
#define BDI_CAN_MAP_DIRECT 0x00000002
#define BDI_CAN_MAP_COPY 0x00000004
#define BDI_CAN_READ_MAP VM_MAYREAD
#define BDI_CAN_WRITE_MAP VM_MAYWRITE
#define BDI_CAN_EXEC_MAP VM_MAYEXEC

(*) BDI_NO_WRITEBACK

Don't attempt to write back dirty pages.

(*) BDI_MEMORY_BACKED

The device is a memory or quasi-memory device.

(*) BDI_CAN_MAP_DIRECT

The device can be mapped directly. This means MAP_SHARED can be supported
within the bounds of the specific mapping protection types.

(*) BDI_CAN_MAP_COPY

A copy into memory can be taken of all or part of the device using the
file->f_op->read() method, and that can be passed to userspace as a
MAP_PRIVATE mapping.

I/O devices, such as framebuffers, probably wouldn't set this as it
doesn't make sense to take a copy. Storage devices (such as flash) and
filesystems (such as RAMFS, CRAMFS or ROMFS) might, because it does make
sense to take a copy for private writable mappings such as are made for
executables.

(*) BDI_CAN_READ_MAP
(*) BDI_CAN_WRITE_MAP
(*) BDI_CAN_EXEC_MAP

Describe the capabilities of the backing device if mapped directly. RAMFS
and RAMDISK could set all three, but CRAMFS, ROMFS and flash filesystems
would probably not indicate writability and I/O devices such as frame
buffers would not indicate executability.


By using such capability flags, no-MMU mmap() (and even MMU mmap()) could:

(1) Reject unsupportable shared mappings early on.

(2) Obviate the need for ROMFS hooks by determining what mappings can be
mapped directly out of ROM and what must fall back to copies in RAM.

I've attached a sample mm/nommu.c mmap() implementation using this capability
information for reference. Note the defaults for a chardev are direct mappings
only and for a file are copied mappings only (copied mappings can be shared
under some circumstances).

David

---
unsigned long do_mmap_pgoff(struct file *file,
unsigned long addr,
unsigned long len,
unsigned long prot,
unsigned long flags,
unsigned long pgoff)
{
struct vm_list_struct *vml = NULL;
struct vm_area_struct *vma = NULL;
struct rb_node *rb;
unsigned long membacked, vm_flags;
void *result;
int ret;

/* do the simple checks first */
if (flags & MAP_FIXED || addr) {
printk(KERN_DEBUG "%d: Can't do fixed-address/overlay mmap of RAM\n",
current->pid);
return -EINVAL;
}

if ((flags & MAP_TYPE) != MAP_PRIVATE &&
(flags & MAP_TYPE) != MAP_SHARED)
return -EINVAL;

if (PAGE_ALIGN(len) == 0)
return addr;

if (len > TASK_SIZE)
return -EINVAL;

/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EINVAL;

if (file) {
/* validate file mapping requests */
struct address_space *mapping;

/* files must support mmap */
if (!file->f_op || !file->f_op->mmap)
return -ENODEV;

/* work out if what we've got could possibly be shared
* - we support chardevs that provide their own "memory"
* - we support files/blockdevs that are memory backed
*/
mapping = file->f_mapping;
if (!mapping)
mapping = file->f_dentry->d_inode->i_mapping;

membacked = 0;
if (mapping && mapping->backing_dev_info)
membacked = mapping->backing_dev_info->memory_backed;
else if (S_ISCHR(file->f_dentry->d_inode->i_mode))
membacked = BDI_MEMORY_BACKED | BDI_CAN_MAP_DIRECT;

if (!membacked) {
if (S_ISREG(file->f_dentry->d_inode->i_mode) ||
S_ISBLK(file->f_dentry->d_inode->i_mode))
membacked = BDI_CAN_MAP_COPY;
}

/* can't map directly unless the driver or filesystem is
* willing to tell us where and can't copy and map if we can't
* read it
*/
if (!file->f_op->get_unmapped_area)
membacked &= ~BDI_CAN_MAP_DIRECT;
if (!file->f_op->read)
membacked &= ~BDI_CAN_MAP_COPY;

if (flags & MAP_SHARED) {
/* do checks for writing, appending and locking */
if ((prot & PROT_WRITE) &&
!(file->f_mode & FMODE_WRITE))
return -EACCES;

if (IS_APPEND(file->f_dentry->d_inode) &&
(file->f_mode & FMODE_WRITE))
return -EACCES;

if (locks_verify_locked(file->f_dentry->d_inode))
return -EAGAIN;

if (!(membacked & BDI_CAN_MAP_DIRECT))
return -ENODEV;

if (((prot & PROT_READ) && !(membacked & BDI_CAN_READ_MAP)) ||
((prot & PROT_WRITE) && !(membacked & BDI_CAN_WRITE_MAP)) ||
((prot & PROT_EXEC) && !(membacked & BDI_CAN_EXEC_MAP))
) {
printk("MAP_SHARED not completely supported on !MMU\n");
return -EINVAL;
}

/* we mustn't privatise shared mappings */
membacked &= ~BDI_CAN_MAP_COPY;
}
else {
/* we're going to read the file into private memory we
* allocate */
if (!(membacked & BDI_CAN_MAP_COPY))
return -ENODEV;

/* we don't permit a private writable mapping to be
* shared */
if (prot & PROT_WRITE)
membacked &= ~BDI_CAN_MAP_DIRECT;
}

/* handle executable mappings and implied executable
* mappings */
if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
if (prot & PROT_EXEC)
return -EPERM;
}
else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
if (membacked & BDI_CAN_EXEC_MAP)
prot |= PROT_EXEC;
}
}
else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
!(membacked & BDI_CAN_EXEC_MAP)
) {
/* backing file is not executable, try to copy */
membacked &= ~BDI_CAN_MAP_DIRECT;
}
}
else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
membacked = BDI_MEMORY_BACKED | BDI_CAN_MAP_COPY;

/* handle PROT_EXEC implication by PROT_READ */
if ((prot & PROT_READ) &&
(current->personality & READ_IMPLIES_EXEC))
prot |= PROT_EXEC;
}

/* translate what we now know into VMA flags */
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
/* vm_flags |= mm->def_flags; */

if (!(membacked & BDI_CAN_MAP_DIRECT)) {
/* attempt to share any file segment that's mapped read-only */
if (file && !(prot & PROT_WRITE))
vm_flags |= VM_MAYSHARE;
}
else {
/* overlay a shareable mapping on the backing device or inode
* if possible - used for chardevs and ramfs/tmpfs/shmfs */
if (flags & MAP_SHARED)
vm_flags |= VM_MAYSHARE | VM_SHARED;
else if ((((vm_flags & membacked) ^ vm_flags) & BDI_MEMBACK_VMFLAGS) == 0)
vm_flags |= VM_MAYSHARE;
}

/* refuse to let anyone share private mappings with this process if
* it's being traced - otherwise breakpoints set in it may interfere
* with another untraced process
*/
if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED))
vm_flags &= ~VM_MAYSHARE;

/* allow the security API to have its say */
ret = security_file_mmap(file, prot, flags);
if (ret)
return ret;

/* we're going to need to record the mapping if it works */
vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
if (!vml)
goto error_getting_vml;
memset(vml, 0, sizeof(*vml));

down_write(&nommu_vma_sem);

/* if we want to share, we need to check for VMAs created by other
* mmap() calls that overlap with our proposed mapping
* - we can only share with an exact match on most regular files
* - shared mappings on character devices and memory backed files are
* permitted to overlap inexactly as far as we are concerned for in
* these cases, sharing is handled in the driver or filesystem rather
* than here
*/
if (vm_flags & VM_MAYSHARE) {
unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long vmpglen;

for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) {
vma = rb_entry(rb, struct vm_area_struct, vm_rb);

if (!(vma->vm_flags & VM_MAYSHARE))
continue;

/* search for overlapping mappings on the same file */
if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
continue;

if (vma->vm_pgoff >= pgoff + pglen)
continue;

vmpglen = (vma->vm_end - vma->vm_start + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (pgoff >= vma->vm_pgoff + vmpglen)
continue;

/* handle inexact matches between mappings */
if (vmpglen != pglen || vma->vm_pgoff != pgoff) {
if (!membacked)
goto sharing_violation;
continue;
}

/* we've found a VMA we can share */
atomic_inc(&vma->vm_usage);

vml->vma = vma;
result = (void *) vma->vm_start;
goto shared;
}
}

vma = NULL;

/* obtain the address to map to - we verify (or select) it and ensure
* that it represents a valid section of the address space
* - this is the hook for quasi-memory character devices to tell us
* the location of a shared mapping
*/
if (file && file->f_op->get_unmapped_area) {
addr = file->f_op->get_unmapped_area(file, addr, len, pgoff,
flags);
if (IS_ERR((void *) addr)) {
ret = addr;
if (ret == (unsigned long) -ENOSYS)
ret = (unsigned long) -ENODEV;
goto error;
}
}

/* we're going to need a VMA struct as well */
vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
if (!vma)
goto error_getting_vma;

memset(vma, 0, sizeof(*vma));
INIT_LIST_HEAD(&vma->anon_vma_node);
atomic_set(&vma->vm_usage, 1);
if (file)
get_file(file);
vma->vm_file = file;
vma->vm_flags = vm_flags;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;

vml->vma = vma;

/* invoke the mapping routine on a file */
if (file) {
ret = file->f_op->mmap(file, vma);

#ifdef DEBUG
printk("f_op->mmap() returned %d (st=%lx)\n",
ret, vma->vm_start);
#endif
result = (void *) vma->vm_start;
if (!ret)
goto done;
else if (ret != -ENOSYS)
goto error;

/* getting an ENOSYS error indicates that direct mmap isn't
* possible (as opposed to tried but failed) so we'll fall
* through to making a private copy of the data and mapping
* that if we can */
ret = -ENODEV;
if (vma->vm_flags & VM_SHARED)
goto error;

/* quick check that we're allowed to read for a private
* mapping */
if (!(membacked & BDI_CAN_MAP_COPY))
goto error;
}

/* allocate some memory to hold the mapping
* - note that this may not return a page-aligned address if the object
* we're allocating is smaller than a page
*/
ret = -ENOMEM;
result = kmalloc(len, GFP_KERNEL);
if (!result) {
printk("Allocation of length %lu from process %d failed\n",
len, current->pid);
show_free_areas();
goto error;
}

vma->vm_start = (unsigned long) result;
vma->vm_end = vma->vm_start + len;

#ifdef WARN_ON_SLACK
if (len + WARN_ON_SLACK <= kobjsize(result))
printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
len, current->pid, kobjsize(result) - len);
#endif

if (file) {
mm_segment_t old_fs = get_fs();
loff_t fpos;

fpos = pgoff;
fpos <<= PAGE_SHIFT;

set_fs(KERNEL_DS);
ret = file->f_op->read(file, (char *) result, len, &fpos);
set_fs(old_fs);

if (ret < 0)
goto error2;
if (ret < len)
memset(result + ret, 0, len - ret);
} else {
memset(result, 0, len);
}

done:
if (!(vma->vm_flags & VM_SHARED)) {
realalloc += kobjsize(result);
askedalloc += len;
}

realalloc += kobjsize(vma);
askedalloc += sizeof(*vma);

current->mm->total_vm += len >> PAGE_SHIFT;

add_nommu_vma(vma);
shared:
realalloc += kobjsize(vml);
askedalloc += sizeof(*vml);

vml->next = current->mm->context.vmlist;
current->mm->context.vmlist = vml;

up_write(&nommu_vma_sem);

if (prot & PROT_EXEC)
flush_icache_range((unsigned long) result,
(unsigned long) result + len);

#ifdef DEBUG
printk("do_mmap:\n");
show_process_blocks();
#endif

return (unsigned long) result;

error2:
kfree(result);
error:
up_write(&nommu_vma_sem);
kfree(vml);
if (vma) {
fput(vma->vm_file);
kfree(vma);
}
return ret;

sharing_violation:
up_write(&nommu_vma_sem);
printk("Attempt to share mismatched mappings\n");
kfree(vml);
return -EINVAL;

error_getting_vma:
up_write(&nommu_vma_sem);
kfree(vml);
printk("Allocation of vml for %lu byte allocation from process %d failed\n",
len, current->pid);
show_free_areas();
return -ENOMEM;

error_getting_vml:
printk("Allocation of vml for %lu byte allocation from process %d failed\n",
len, current->pid);
show_free_areas();
return -ENOMEM;
}


2005-03-02 17:21:54

by Andrew Morton

[permalink] [raw]
Subject: Re: Improving mmap() support for !MMU further

David Howells <[email protected]> wrote:
>
>
> Hi Linus,
>
> I'd like to improve mmap() support on !MMU still further by overloading struct
> backing_dev_info::memory_backed to hold flags describing what the backing
> device is capable of with respect to direct memory access.

->memory_backed is already overloaded in various awkward ways. It would be
good to fix that up first, then add enhancements in a later patch.

Furthermore I'd suggest that we remove ->memory_backed and simply add a
bunch of new booleans to backing_dev_info rather than futzing with
bitflags. Those booleans could be bitfields, of course.

And those fields should refer to device capabilities and not to device
characteristics: "what it can do" rather than "what it is". So
"contributes to dirty memory" and "needs writeback" are good and "is a ram
disk thingy" is bad.

So in 2.6.11 as-is, we have:

bdi->dirty_memory_acct Contributes to dirty memory accounting
bdi->needs_writeback Needs writeback

> (*) If bdi->memory_backed is 0, then the backing device is not accessible as
> memory.

Not sure what this means.

> (*) If the bdi->memory_backed is non-zero, then it's a bit mask of the
> following:
>
> #define BDI_NO_WRITEBACK 0x00000001

Invert the sense of this, use ->needs_writeback

> #define BDI_MEMORY_BACKED 0x00000001

Lose this.

> #define BDI_CAN_MAP_DIRECT 0x00000002

bdi->can_map_direct

> #define BDI_CAN_MAP_COPY 0x00000004

bdi->can_map_copy

> #define BDI_CAN_READ_MAP VM_MAYREAD
> #define BDI_CAN_WRITE_MAP VM_MAYWRITE
> #define BDI_CAN_EXEC_MAP VM_MAYEXEC

A separate field in backing_dev_info?


That `membacked' stuff in the !NOMMU implementation of do_mmap_pgoff()
looks like it could benefit from some rework as a result of the above as
well. It all looks rather inferential.

2005-03-02 21:51:00

by David Howells

[permalink] [raw]
Subject: [PATCH 1/2] BDI: Provide backing device capability information


The attached patch does two things:

(1) It gets rid of backing_dev_info::memory_backed and replaces it with a
pair of boolean values:

(*) dirty_memory_acct

True if the pages associated with this backing device should be
tracked by dirty page accounting.

(*) writeback_if_dirty

True if the pages associated with this backing device should have
writepage() or writepages() invoked upon them to clean them.

(2) It adds a backing device capability mask that indicates what a backing
device is capable of; currently only in regard to memory mapping
facilities. These flags indicate whether a device can be mapped directly,
whether it can be copied for a mapping, and whether direct mappings can
be read, written and/or executed. This information is primarily aimed at
improving no-MMU private mapping support.


Signed-Off-By: David Howells <[email protected]>
---
warthog>diffstat -p1 memback-bdicap-2611rc4.diff
drivers/block/ll_rw_blk.c | 3 ++-
drivers/block/rd.c | 14 ++++++++------
drivers/char/mem.c | 6 ++++++
fs/buffer.c | 2 +-
fs/fs-writeback.c | 4 ++--
fs/hugetlbfs/inode.c | 5 +++--
fs/ramfs/inode.c | 7 +++++--
fs/sysfs/inode.c | 5 +++--
include/linux/backing-dev.h | 22 +++++++++++++++++++++-
mm/filemap.c | 6 +++---
mm/page-writeback.c | 6 +++---
mm/readahead.c | 8 +++++---
mm/shmem.c | 7 ++++---
mm/swap_state.c | 5 +++--
14 files changed, 69 insertions(+), 31 deletions(-)

diff -uNrp /warthog/kernels/linux-2.6.11-rc4/include/linux/backing-dev.h linux-2.6.11-rc4-memback/include/linux/backing-dev.h
--- /warthog/kernels/linux-2.6.11-rc4/include/linux/backing-dev.h 2004-06-18 13:44:05.000000000 +0100
+++ linux-2.6.11-rc4-memback/include/linux/backing-dev.h 2005-03-02 21:06:31.733471683 +0000
@@ -25,13 +25,33 @@ typedef int (congested_fn)(void *, int);
struct backing_dev_info {
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* Always use atomic bitops on this */
- int memory_backed; /* Cannot clean pages with writepage */
+ unsigned short capabilities; /* Device capabilities */
+ unsigned dirty_memory_acct : 1; /* Contributes to dirty memory accounting */
+ unsigned writeback_if_dirty : 1; /* Dirty memory should be written back */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
};

+
+/*
+ * Flags in backing_dev_info::capability
+ * - These flags let !MMU mmap() govern direct device mapping vs immediate
+ * copying more easily for MAP_PRIVATE, especially for ROM filesystems
+ */
+#define BDI_CAP_MAP_COPY 0x00000001 /* Copy can be mapped (MAP_PRIVATE) */
+#define BDI_CAP_MAP_DIRECT 0x00000002 /* Can be mapped directly (MAP_SHARED) */
+#define BDI_CAP_READ_MAP VM_MAYREAD /* Can be mapped for reading */
+#define BDI_CAP_WRITE_MAP VM_MAYWRITE /* Can be mapped for writing */
+#define BDI_CAP_EXEC_MAP VM_MAYEXEC /* Can be mapped for execution */
+#define BDI_CAP_VMFLAGS \
+ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
+
+#if (BDI_CAP_MAP_DIRECT | BDI_CAP_COPY_FOR_MAP) & BDI_CAP_VMFLAGS
+#error please change backing_dev_info::capabilities flags
+#endif
+
extern struct backing_dev_info default_backing_dev_info;
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);

diff -uNrp /warthog/kernels/linux-2.6.11-rc4/drivers/block/ll_rw_blk.c linux-2.6.11-rc4-memback/drivers/block/ll_rw_blk.c
--- /warthog/kernels/linux-2.6.11-rc4/drivers/block/ll_rw_blk.c 2005-02-14 12:18:23.000000000 +0000
+++ linux-2.6.11-rc4-memback/drivers/block/ll_rw_blk.c 2005-03-02 18:19:33.000000000 +0000
@@ -238,7 +238,8 @@ void blk_queue_make_request(request_queu
q->make_request_fn = mfn;
q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
- q->backing_dev_info.memory_backed = 0;
+ q->backing_dev_info.writeback_if_dirty = 1;
+ q->backing_dev_info.dirty_memory_acct = 1;
blk_queue_max_sectors(q, MAX_SECTORS);
blk_queue_hardsect_size(q, 512);
blk_queue_dma_alignment(q, 511);
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/drivers/block/rd.c linux-2.6.11-rc4-memback/drivers/block/rd.c
--- /warthog/kernels/linux-2.6.11-rc4/drivers/block/rd.c 2005-01-04 11:13:05.000000000 +0000
+++ linux-2.6.11-rc4-memback/drivers/block/rd.c 2005-03-02 18:57:41.000000000 +0000
@@ -324,9 +324,10 @@ static int rd_ioctl(struct inode *inode,
* writeback and it does not contribute to dirty memory accounting.
*/
static struct backing_dev_info rd_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
- .unplug_io_fn = default_unplug_io_fn,
+ .ra_pages = 0, /* No readahead */
+ .dirty_memory_acct = 0, /* Doesn't contribute to dirty memory */
+ .writeback_if_dirty = 0,
+ .unplug_io_fn = default_unplug_io_fn,
};

/*
@@ -335,9 +336,10 @@ static struct backing_dev_info rd_backin
* memory accounting.
*/
static struct backing_dev_info rd_file_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .memory_backed = 0, /* Does contribute to dirty memory */
- .unplug_io_fn = default_unplug_io_fn,
+ .ra_pages = 0, /* No readahead */
+ .dirty_memory_acct = 1, /* Does contribute to dirty memory */
+ .writeback_if_dirty = 1,
+ .unplug_io_fn = default_unplug_io_fn,
};

static int rd_open(struct inode *inode, struct file *filp)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/drivers/char/mem.c linux-2.6.11-rc4-memback/drivers/char/mem.c
--- /warthog/kernels/linux-2.6.11-rc4/drivers/char/mem.c 2005-01-04 11:13:10.000000000 +0000
+++ linux-2.6.11-rc4-memback/drivers/char/mem.c 2005-03-02 21:06:39.784800247 +0000
@@ -23,6 +23,7 @@
#include <linux/devfs_fs_kernel.h>
#include <linux/ptrace.h>
#include <linux/device.h>
+#include <linux/backing-dev.h>

#include <asm/uaccess.h>
#include <asm/io.h>
@@ -613,6 +614,10 @@ static struct file_operations zero_fops
.mmap = mmap_zero,
};

+static struct backing_dev_info zero_bdi = {
+ .capabilities = BDI_CAP_MAP_COPY,
+};
+
static struct file_operations full_fops = {
.llseek = full_lseek,
.read = read_full,
@@ -659,6 +664,7 @@ static int memory_open(struct inode * in
break;
#endif
case 5:
+ filp->f_mapping->backing_dev_info = &zero_bdi;
filp->f_op = &zero_fops;
break;
case 7:
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/buffer.c linux-2.6.11-rc4-memback/fs/buffer.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/buffer.c 2005-02-14 12:18:50.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/buffer.c 2005-03-02 18:14:19.000000000 +0000
@@ -876,7 +876,7 @@ int __set_page_dirty_buffers(struct page
if (!TestSetPageDirty(page)) {
spin_lock_irq(&mapping->tree_lock);
if (page->mapping) { /* Race with truncate? */
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping->backing_dev_info->dirty_memory_acct)
inc_page_state(nr_dirty);
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/fs-writeback.c linux-2.6.11-rc4-memback/fs/fs-writeback.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/fs-writeback.c 2005-02-14 12:18:51.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/fs-writeback.c 2005-03-02 18:15:48.000000000 +0000
@@ -315,7 +315,7 @@ sync_sb_inodes(struct super_block *sb, s
struct backing_dev_info *bdi = mapping->backing_dev_info;
long pages_skipped;

- if (bdi->memory_backed) {
+ if (!bdi->writeback_if_dirty) {
list_move(&inode->i_list, &sb->s_dirty);
if (sb == blockdev_superblock) {
/*
@@ -566,7 +566,7 @@ int write_inode_now(struct inode *inode,
.sync_mode = WB_SYNC_ALL,
};

- if (inode->i_mapping->backing_dev_info->memory_backed)
+ if (!inode->i_mapping->backing_dev_info->writeback_if_dirty)
return 0;

might_sleep();
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/hugetlbfs/inode.c linux-2.6.11-rc4-memback/fs/hugetlbfs/inode.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/hugetlbfs/inode.c 2005-02-14 12:18:51.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/hugetlbfs/inode.c 2005-03-02 18:34:02.000000000 +0000
@@ -39,8 +39,9 @@ static struct inode_operations hugetlbfs
static struct inode_operations hugetlbfs_inode_operations;

static struct backing_dev_info hugetlbfs_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .ra_pages = 0, /* No readahead */
+ .dirty_memory_acct = 0, /* Doesn't contribute to dirty memory */
+ .writeback_if_dirty = 0,
};

int sysctl_hugetlb_shm_group;
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/ramfs/inode.c linux-2.6.11-rc4-memback/fs/ramfs/inode.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/ramfs/inode.c 2005-02-14 12:18:53.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/ramfs/inode.c 2005-03-02 18:32:54.000000000 +0000
@@ -44,8 +44,11 @@ static struct inode_operations ramfs_fil
static struct inode_operations ramfs_dir_inode_operations;

static struct backing_dev_info ramfs_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .ra_pages = 0, /* No readahead */
+ .dirty_memory_acct = 0, /* Does not contribute to dirty memory */
+ .writeback_if_dirty = 0,
+ .capabilities = BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
+ BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
};

struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/sysfs/inode.c linux-2.6.11-rc4-memback/fs/sysfs/inode.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/sysfs/inode.c 2005-01-04 11:13:43.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/sysfs/inode.c 2005-03-02 18:18:43.000000000 +0000
@@ -22,8 +22,9 @@ static struct address_space_operations s
};

static struct backing_dev_info sysfs_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .ra_pages = 0, /* No readahead */
+ .writeback_if_dirty = 0,
+ .dirty_memory_acct = 0, /* Doesn't contribute to dirty memory */
};

struct inode * sysfs_new_inode(mode_t mode)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/filemap.c linux-2.6.11-rc4-memback/mm/filemap.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/filemap.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/filemap.c 2005-03-02 17:33:54.000000000 +0000
@@ -172,7 +172,7 @@ static int __filemap_fdatawrite_range(st
.end = end,
};

- if (mapping->backing_dev_info->memory_backed)
+ if (!mapping->backing_dev_info->writeback_if_dirty)
return 0;

ret = do_writepages(mapping, &wbc);
@@ -269,7 +269,7 @@ int sync_page_range(struct inode *inode,
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
int ret;

- if (mapping->backing_dev_info->memory_backed || !count)
+ if (!mapping->backing_dev_info->writeback_if_dirty || !count)
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0) {
@@ -295,7 +295,7 @@ int sync_page_range_nolock(struct inode
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
int ret;

- if (mapping->backing_dev_info->memory_backed || !count)
+ if (!mapping->backing_dev_info->writeback_if_dirty || !count)
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/page-writeback.c linux-2.6.11-rc4-memback/mm/page-writeback.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/page-writeback.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/page-writeback.c 2005-03-02 17:34:43.000000000 +0000
@@ -605,7 +605,7 @@ int __set_page_dirty_nobuffers(struct pa
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping->backing_dev_info->dirty_memory_acct)
inc_page_state(nr_dirty);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
@@ -691,7 +691,7 @@ int test_clear_page_dirty(struct page *p
page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping->backing_dev_info->dirty_memory_acct)
dec_page_state(nr_dirty);
return 1;
}
@@ -722,7 +722,7 @@ int clear_page_dirty_for_io(struct page

if (mapping) {
if (TestClearPageDirty(page)) {
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping->backing_dev_info->dirty_memory_acct)
dec_page_state(nr_dirty);
return 1;
}
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/readahead.c linux-2.6.11-rc4-memback/mm/readahead.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/readahead.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/readahead.c 2005-03-02 19:12:36.000000000 +0000
@@ -21,9 +21,11 @@ void default_unplug_io_fn(struct backing
EXPORT_SYMBOL(default_unplug_io_fn);

struct backing_dev_info default_backing_dev_info = {
- .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
- .state = 0,
- .unplug_io_fn = default_unplug_io_fn,
+ .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
+ .state = 0,
+ .dirty_memory_acct = 1,
+ .writeback_if_dirty = 1,
+ .unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);

diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/shmem.c linux-2.6.11-rc4-memback/mm/shmem.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/shmem.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/shmem.c 2005-03-02 19:27:16.000000000 +0000
@@ -183,9 +183,10 @@ static struct inode_operations shmem_spe
static struct vm_operations_struct shmem_vm_ops;

static struct backing_dev_info shmem_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
- .unplug_io_fn = default_unplug_io_fn,
+ .ra_pages = 0, /* No readahead */
+ .dirty_memory_acct = 0, /* Doesn't contribute to dirty memory */
+ .writeback_if_dirty = 0,
+ .unplug_io_fn = default_unplug_io_fn,
};

static LIST_HEAD(shmem_swaplist);
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/swap_state.c linux-2.6.11-rc4-memback/mm/swap_state.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/swap_state.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/swap_state.c 2005-03-02 19:26:58.000000000 +0000
@@ -29,8 +29,9 @@ static struct address_space_operations s
};

static struct backing_dev_info swap_backing_dev_info = {
- .memory_backed = 1, /* Does not contribute to dirty memory */
- .unplug_io_fn = swap_unplug_io_fn,
+ .dirty_memory_acct = 0, /* Doesn't contribute to dirty memory */
+ .writeback_if_dirty = 0,
+ .unplug_io_fn = swap_unplug_io_fn,
};

struct address_space swapper_space = {

2005-03-02 21:57:51

by David Howells

[permalink] [raw]
Subject: [PATCH 2/2] BDI: Improve nommu mmap support


The attached patch makes use of patch #1 to improve nommu mmap support;
particularly in terms on supporting private mappings. It does this by
examining the device capability mask now in the backing_dev_info structure.

Private mappings will now be backed by the underlying device directly if
possible, where "possible" is constrained by the protection mask parameter and
the device capabilities mask.

I've also split the do_mmap_pgoff() function contents into a number of
auxilliary functions to make it easier to understand.

The documentation is also updated; including the addition of a warning about
permitting direct mapping of flash chips and the problems of XIP vs write.

Signed-Off-By: David Howells <[email protected]>
---
warthog>diffstat -p1 memback-nommu-2611rc4.diff
Documentation/nommu-mmap.txt | 81 +++++-
include/linux/mm.h | 1
mm/nommu.c | 519 ++++++++++++++++++++++++++-----------------
3 files changed, 384 insertions(+), 217 deletions(-)

diff -uNrp /warthog/kernels/linux-2.6.11-rc4/include/linux/mm.h linux-2.6.11-rc4-memback/include/linux/mm.h
--- /warthog/kernels/linux-2.6.11-rc4/include/linux/mm.h 2005-02-14 12:19:01.000000000 +0000
+++ linux-2.6.11-rc4-memback/include/linux/mm.h 2005-03-02 20:29:50.798818320 +0000
@@ -164,6 +164,7 @@ extern unsigned int kobjsize(const void
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
+#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */

#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/nommu.c linux-2.6.11-rc4-memback/mm/nommu.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/nommu.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/nommu.c 2005-03-02 20:50:04.063807220 +0000
@@ -256,29 +256,6 @@ asmlinkage unsigned long sys_brk(unsigne
return mm->brk = brk;
}

-/*
- * Combine the mmap "prot" and "flags" argument into one "vm_flags" used
- * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
- * into "VM_xxx".
- */
-static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flags)
-{
-#define _trans(x,bit1,bit2) \
-((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)
-
- unsigned long prot_bits, flag_bits;
- prot_bits =
- _trans(prot, PROT_READ, VM_READ) |
- _trans(prot, PROT_WRITE, VM_WRITE) |
- _trans(prot, PROT_EXEC, VM_EXEC);
- flag_bits =
- _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
- _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
- _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
- return prot_bits | flag_bits;
-#undef _trans
-}
-
#ifdef DEBUG
static void show_process_blocks(void)
{
@@ -381,29 +358,32 @@ static void delete_nommu_vma(struct vm_a
}

/*
- * handle mapping creation for uClinux
+ * determine whether a mapping should be permitted and, if so, what sort of
+ * mapping we're capable of supporting
*/
-unsigned long do_mmap_pgoff(struct file *file,
- unsigned long addr,
- unsigned long len,
- unsigned long prot,
- unsigned long flags,
- unsigned long pgoff)
+static int validate_mmap_request(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long pgoff,
+ unsigned long *_capabilities)
{
- struct vm_list_struct *vml = NULL;
- struct vm_area_struct *vma = NULL;
- struct rb_node *rb;
- unsigned int vm_flags;
- void *result;
- int ret, membacked;
+ unsigned long capabilities;
+ int ret;

/* do the simple checks first */
if (flags & MAP_FIXED || addr) {
- printk(KERN_DEBUG "%d: Can't do fixed-address/overlay mmap of RAM\n",
+ printk(KERN_DEBUG
+ "%d: Can't do fixed-address/overlay mmap of RAM\n",
current->pid);
return -EINVAL;
}

+ if ((flags & MAP_TYPE) != MAP_PRIVATE &&
+ (flags & MAP_TYPE) != MAP_SHARED)
+ return -EINVAL;
+
if (PAGE_ALIGN(len) == 0)
return addr;

@@ -414,35 +394,58 @@ unsigned long do_mmap_pgoff(struct file
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EINVAL;

- /* validate file mapping requests */
- membacked = 0;
if (file) {
+ /* validate file mapping requests */
+ struct address_space *mapping;
+
/* files must support mmap */
if (!file->f_op || !file->f_op->mmap)
return -ENODEV;

- if ((prot & PROT_EXEC) &&
- (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
- return -EPERM;
-
/* work out if what we've got could possibly be shared
* - we support chardevs that provide their own "memory"
* - we support files/blockdevs that are memory backed
*/
- if (S_ISCHR(file->f_dentry->d_inode->i_mode)) {
- membacked = 1;
- }
- else {
- struct address_space *mapping = file->f_mapping;
- if (!mapping)
- mapping = file->f_dentry->d_inode->i_mapping;
- if (mapping && mapping->backing_dev_info)
- membacked = mapping->backing_dev_info->memory_backed;
+ mapping = file->f_mapping;
+ if (!mapping)
+ mapping = file->f_dentry->d_inode->i_mapping;
+
+ capabilities = 0;
+ if (mapping && mapping->backing_dev_info)
+ capabilities = mapping->backing_dev_info->capabilities;
+
+ if (!capabilities) {
+ /* no explicit capabilities set, so assume some
+ * defaults */
+ switch (file->f_dentry->d_inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFBLK:
+ capabilities = BDI_CAP_MAP_COPY;
+ break;
+
+ case S_IFCHR:
+ capabilities =
+ BDI_CAP_MAP_DIRECT |
+ BDI_CAP_READ_MAP |
+ BDI_CAP_WRITE_MAP;
+ break;
+
+ default:
+ return -EINVAL;
+ }
}

+ /* eliminate any capabilities that we can't support on this
+ * device */
+ if (!file->f_op->get_unmapped_area)
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
+ if (!file->f_op->read)
+ capabilities &= ~BDI_CAP_MAP_COPY;
+
if (flags & MAP_SHARED) {
/* do checks for writing, appending and locking */
- if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE))
+ if ((prot & PROT_WRITE) &&
+ !(file->f_mode & FMODE_WRITE))
return -EACCES;

if (IS_APPEND(file->f_dentry->d_inode) &&
@@ -452,64 +455,243 @@ unsigned long do_mmap_pgoff(struct file
if (locks_verify_locked(file->f_dentry->d_inode))
return -EAGAIN;

- if (!membacked) {
+ if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ return -ENODEV;
+
+ if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
+ ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
+ ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
+ ) {
printk("MAP_SHARED not completely supported on !MMU\n");
return -EINVAL;
}

- /* we require greater support from the driver or
- * filesystem - we ask it to tell us what memory to
- * use */
- if (!file->f_op->get_unmapped_area)
- return -ENODEV;
+ /* we mustn't privatise shared mappings */
+ capabilities &= ~BDI_CAP_MAP_COPY;
}
else {
- /* we read private files into memory we allocate */
- if (!file->f_op->read)
+ /* we're going to read the file into private memory we
+ * allocate */
+ if (!(capabilities & BDI_CAP_MAP_COPY))
return -ENODEV;
+
+ /* we don't permit a private writable mapping to be
+ * shared with the backing device */
+ if (prot & PROT_WRITE)
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
+ }
+
+ /* handle executable mappings and implied executable
+ * mappings */
+ if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+ if (prot & PROT_EXEC)
+ return -EPERM;
+ }
+ else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
+ /* handle implication of PROT_EXEC by PROT_READ */
+ if (current->personality & READ_IMPLIES_EXEC) {
+ if (capabilities & BDI_CAP_EXEC_MAP)
+ prot |= PROT_EXEC;
+ }
+ }
+ else if ((prot & PROT_READ) &&
+ (prot & PROT_EXEC) &&
+ !(capabilities & BDI_CAP_EXEC_MAP)
+ ) {
+ /* backing file is not executable, try to copy */
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
}
}
+ else {
+ /* anonymous mappings are always memory backed and can be
+ * privately mapped
+ */
+ capabilities = BDI_CAP_MAP_COPY;

- /* handle PROT_EXEC implication by PROT_READ */
- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+ /* handle PROT_EXEC implication by PROT_READ */
+ if ((prot & PROT_READ) &&
+ (current->personality & READ_IMPLIES_EXEC))
prot |= PROT_EXEC;
+ }

- /* do simple checking here so the lower-level routines won't have
- * to. we assume access permissions have been handled by the open
- * of the memory object, so we don't do any here.
- */
- vm_flags = calc_vm_flags(prot,flags) /* | mm->def_flags */
- | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+ /* allow the security API to have its say */
+ ret = security_file_mmap(file, prot, flags);
+ if (ret < 0)
+ return ret;

- if (!membacked) {
- /* share any file segment that's mapped read-only */
- if (((flags & MAP_PRIVATE) && !(prot & PROT_WRITE) && file) ||
- ((flags & MAP_SHARED) && !(prot & PROT_WRITE) && file))
- vm_flags |= VM_MAYSHARE;
+ /* looks okay */
+ *_capabilities = capabilities;
+ return 0;
+}

- /* refuse to let anyone share files with this process if it's being traced -
- * otherwise breakpoints set in it may interfere with another untraced process
- */
- if (current->ptrace & PT_PTRACED)
- vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
+/*
+ * we've determined that we can make the mapping, now translate what we
+ * now know into VMA flags
+ */
+static unsigned long determine_vm_flags(struct file *file,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long capabilities)
+{
+ unsigned long vm_flags;
+
+ vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
+ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+ /* vm_flags |= mm->def_flags; */
+
+ if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
+ /* attempt to share read-only copies of mapped file chunks */
+ if (file && !(prot & PROT_WRITE))
+ vm_flags |= VM_MAYSHARE;
}
else {
- /* permit sharing of character devices and ramfs files at any time for
- * anything other than a privately writable mapping
- */
- if (!(flags & MAP_PRIVATE) || !(prot & PROT_WRITE)) {
+ /* overlay a shareable mapping on the backing device or inode
+ * if possible - used for chardevs, ramfs/tmpfs/shmfs and
+ * romfs/cramfs */
+ if (flags & MAP_SHARED)
+ vm_flags |= VM_MAYSHARE | VM_SHARED;
+ else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0)
vm_flags |= VM_MAYSHARE;
- if (flags & MAP_SHARED)
- vm_flags |= VM_SHARED;
+ }
+
+ /* refuse to let anyone share private mappings with this process if
+ * it's being traced - otherwise breakpoints set in it may interfere
+ * with another untraced process
+ */
+ if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED))
+ vm_flags &= ~VM_MAYSHARE;
+
+ return vm_flags;
+}
+
+/*
+ * set up a shared mapping on a file
+ */
+static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
+{
+ int ret;
+
+ ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+ if (ret != -ENOSYS)
+ return ret;
+
+ /* getting an ENOSYS error indicates that direct mmap isn't
+ * possible (as opposed to tried but failed) so we'll fall
+ * through to making a private copy of the data and mapping
+ * that if we can */
+ return -ENODEV;
+}
+
+/*
+ * set up a private mapping or an anonymous shared mapping
+ */
+static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
+{
+ void *base;
+ int ret;
+
+ /* invoke the file's mapping function so that it can keep track of
+ * shared mappings on devices or memory
+ * - VM_MAYSHARE will be set if it may attempt to share
+ */
+ if (vma->vm_file) {
+ ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+ if (ret != -ENOSYS) {
+ /* shouldn't return success if we're not sharing */
+ BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE));
+ return ret; /* success or a real error */
}
+
+ /* getting an ENOSYS error indicates that direct mmap isn't
+ * possible (as opposed to tried but failed) so we'll try to
+ * make a private copy of the data and map that instead */
}

- /* allow the security API to have its say */
- ret = security_file_mmap(file, prot, flags);
- if (ret)
+ /* allocate some memory to hold the mapping
+ * - note that this may not return a page-aligned address if the object
+ * we're allocating is smaller than a page
+ */
+ base = kmalloc(len, GFP_KERNEL);
+ if (!base)
+ goto enomem;
+
+ vma->vm_start = (unsigned long) base;
+ vma->vm_end = vma->vm_start + len;
+ vma->vm_flags |= VM_MAPPED_COPY;
+
+#ifdef WARN_ON_SLACK
+ if (len + WARN_ON_SLACK <= kobjsize(result))
+ printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
+ len, current->pid, kobjsize(result) - len);
+#endif
+
+ if (vma->vm_file) {
+ /* read the contents of a file into the copy */
+ mm_segment_t old_fs;
+ loff_t fpos;
+
+ fpos = vma->vm_pgoff;
+ fpos <<= PAGE_SHIFT;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
+ set_fs(old_fs);
+
+ if (ret < 0)
+ goto error_free;
+
+ /* clear the last little bit */
+ if (ret < len)
+ memset(base + ret, 0, len - ret);
+
+ } else {
+ /* if it's an anonymous mapping, then just clear it */
+ memset(base, 0, len);
+ }
+
+ return 0;
+
+error_free:
+ kfree(base);
+ vma->vm_start = 0;
+ return ret;
+
+enomem:
+ printk("Allocation of length %lu from process %d failed\n",
+ len, current->pid);
+ show_free_areas();
+ return -ENOMEM;
+}
+
+/*
+ * handle mapping creation for uClinux
+ */
+unsigned long do_mmap_pgoff(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long pgoff)
+{
+ struct vm_list_struct *vml = NULL;
+ struct vm_area_struct *vma = NULL;
+ struct rb_node *rb;
+ unsigned long capabilities, vm_flags;
+ void *result;
+ int ret;
+
+ /* decide whether we should attempt the mapping, and if so what sort of
+ * mapping */
+ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
+ &capabilities);
+ if (ret < 0)
return ret;

+ /* we've determined that we can make the mapping, now translate what we
+ * now know into VMA flags */
+ vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+
/* we're going to need to record the mapping if it works */
vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
if (!vml)
@@ -518,8 +700,8 @@ unsigned long do_mmap_pgoff(struct file

down_write(&nommu_vma_sem);

- /* if we want to share, we need to search for VMAs created by another
- * mmap() call that overlap with our proposed mapping
+ /* if we want to share, we need to check for VMAs created by other
+ * mmap() calls that overlap with our proposed mapping
* - we can only share with an exact match on most regular files
* - shared mappings on character devices and memory backed files are
* permitted to overlap inexactly as far as we are concerned for in
@@ -543,13 +725,14 @@ unsigned long do_mmap_pgoff(struct file
if (vma->vm_pgoff >= pgoff + pglen)
continue;

- vmpglen = (vma->vm_end - vma->vm_start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1;
+ vmpglen >>= PAGE_SHIFT;
if (pgoff >= vma->vm_pgoff + vmpglen)
continue;

- /* handle inexact matches between mappings */
- if (vmpglen != pglen || vma->vm_pgoff != pgoff) {
- if (!membacked)
+ /* handle inexactly overlapping matches between mappings */
+ if (vma->vm_pgoff != pgoff || vmpglen != pglen) {
+ if (!(capabilities & BDI_CAP_MAP_DIRECT))
goto sharing_violation;
continue;
}
@@ -561,21 +744,30 @@ unsigned long do_mmap_pgoff(struct file
result = (void *) vma->vm_start;
goto shared;
}
- }

- vma = NULL;
+ vma = NULL;

- /* obtain the address to map to. we verify (or select) it and ensure
- * that it represents a valid section of the address space
- * - this is the hook for quasi-memory character devices
- */
- if (file && file->f_op->get_unmapped_area) {
- addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags);
- if (IS_ERR((void *) addr)) {
- ret = addr;
- if (ret == (unsigned long) -ENOSYS)
+ /* obtain the address at which to make a shared mapping
+ * - this is the hook for quasi-memory character devices to
+ * tell us the location of a shared mapping
+ */
+ if (file && file->f_op->get_unmapped_area) {
+ addr = file->f_op->get_unmapped_area(file, addr, len,
+ pgoff, flags);
+ if (IS_ERR((void *) addr)) {
+ ret = addr;
+ if (ret != (unsigned long) -ENOSYS)
+ goto error;
+
+ /* the driver refused to tell us where to site
+ * the mapping so we'll have to attempt to copy
+ * it */
ret = (unsigned long) -ENODEV;
- goto error;
+ if (!(capabilities & BDI_CAP_MAP_COPY))
+ goto error;
+
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
+ }
}
}

@@ -597,96 +789,18 @@ unsigned long do_mmap_pgoff(struct file

vml->vma = vma;

- /* determine the object being mapped and call the appropriate specific
- * mapper.
- */
- if (file) {
-#ifdef MAGIC_ROM_PTR
- /* First, try simpler routine designed to give us a ROM pointer. */
- if (file->f_op->romptr && !(prot & PROT_WRITE)) {
- ret = file->f_op->romptr(file, vma);
-#ifdef DEBUG
- printk("romptr mmap returned %d (st=%lx)\n",
- ret, vma->vm_start);
-#endif
- result = (void *) vma->vm_start;
- if (!ret)
- goto done;
- else if (ret != -ENOSYS)
- goto error;
- } else
-#endif /* MAGIC_ROM_PTR */
- /* Then try full mmap routine, which might return a RAM
- * pointer, or do something truly complicated
- */
- if (file->f_op->mmap) {
- ret = file->f_op->mmap(file, vma);
-
-#ifdef DEBUG
- printk("f_op->mmap() returned %d (st=%lx)\n",
- ret, vma->vm_start);
-#endif
- result = (void *) vma->vm_start;
- if (!ret)
- goto done;
- else if (ret != -ENOSYS)
- goto error;
- } else {
- ret = -ENODEV; /* No mapping operations defined */
- goto error;
- }
-
- /* An ENOSYS error indicates that mmap isn't possible (as
- * opposed to tried but failed) so we'll fall through to the
- * copy. */
- }
-
- /* allocate some memory to hold the mapping
- * - note that this may not return a page-aligned address if the object
- * we're allocating is smaller than a page
- */
- ret = -ENOMEM;
- result = kmalloc(len, GFP_KERNEL);
- if (!result) {
- printk("Allocation of length %lu from process %d failed\n",
- len, current->pid);
- show_free_areas();
+ /* set up the mapping */
+ if (file && vma->vm_flags & VM_SHARED)
+ ret = do_mmap_shared_file(vma, len);
+ else
+ ret = do_mmap_private(vma, len);
+ if (ret < 0)
goto error;
- }
-
- vma->vm_start = (unsigned long) result;
- vma->vm_end = vma->vm_start + len;

-#ifdef WARN_ON_SLACK
- if (len + WARN_ON_SLACK <= kobjsize(result))
- printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
- len, current->pid, kobjsize(result) - len);
-#endif
+ /* okay... we have a mapping; now we have to register it */
+ result = (void *) vma->vm_start;

- if (file) {
- mm_segment_t old_fs = get_fs();
- loff_t fpos;
-
- fpos = pgoff;
- fpos <<= PAGE_SHIFT;
-
- set_fs(KERNEL_DS);
- ret = file->f_op->read(file, (char *) result, len, &fpos);
- set_fs(old_fs);
-
- if (ret < 0)
- goto error2;
- if (ret < len)
- memset(result + ret, 0, len - ret);
- } else {
- memset(result, 0, len);
- }
-
- if (prot & PROT_EXEC)
- flush_icache_range((unsigned long) result, (unsigned long) result + len);
-
- done:
- if (!(vma->vm_flags & VM_SHARED)) {
+ if (vma->vm_flags & VM_MAPPED_COPY) {
realalloc += kobjsize(result);
askedalloc += len;
}
@@ -697,6 +811,7 @@ unsigned long do_mmap_pgoff(struct file
current->mm->total_vm += len >> PAGE_SHIFT;

add_nommu_vma(vma);
+
shared:
realalloc += kobjsize(vml);
askedalloc += sizeof(*vml);
@@ -706,6 +821,10 @@ unsigned long do_mmap_pgoff(struct file

up_write(&nommu_vma_sem);

+ if (prot & PROT_EXEC)
+ flush_icache_range((unsigned long) result,
+ (unsigned long) result + len);
+
#ifdef DEBUG
printk("do_mmap:\n");
show_process_blocks();
@@ -713,8 +832,6 @@ unsigned long do_mmap_pgoff(struct file

return (unsigned long) result;

- error2:
- kfree(result);
error:
up_write(&nommu_vma_sem);
kfree(vml);
@@ -761,7 +878,7 @@ static void put_vma(struct vm_area_struc

/* IO memory and memory shared directly out of the pagecache from
* ramfs/tmpfs mustn't be released here */
- if (!(vma->vm_flags & (VM_IO | VM_SHARED)) && vma->vm_start) {
+ if (vma->vm_flags & VM_MAPPED_COPY) {
realalloc -= kobjsize((void *) vma->vm_start);
askedalloc -= vma->vm_end - vma->vm_start;
kfree((void *) vma->vm_start);
@@ -784,13 +901,6 @@ int do_munmap(struct mm_struct *mm, unsi
struct vm_list_struct *vml, **parent;
unsigned long end = addr + len;

-#ifdef MAGIC_ROM_PTR
- /* For efficiency's sake, if the pointer is obviously in ROM,
- don't bother walking the lists to free it */
- if (is_in_rom(addr))
- return 0;
-#endif
-
#ifdef DEBUG
printk("do_munmap:\n");
#endif
@@ -1062,4 +1172,3 @@ int __vm_enough_memory(long pages, int c

return -ENOMEM;
}
-
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/Documentation/nommu-mmap.txt linux-2.6.11-rc4-memback/Documentation/nommu-mmap.txt
--- /warthog/kernels/linux-2.6.11-rc4/Documentation/nommu-mmap.txt 2005-02-14 12:18:03.000000000 +0000
+++ linux-2.6.11-rc4-memback/Documentation/nommu-mmap.txt 2005-03-02 19:31:37.000000000 +0000
@@ -36,20 +36,35 @@ and it's also much more restricted in th
In the MMU case: VM regions backed by pages read from file; changes to
the underlying file are reflected in the mapping; copied across fork.

- In the no-MMU case: VM regions backed by arbitrary contiguous runs of
- pages into which the appropriate bit of the file is read; any remaining
- bit of the mapping is cleared; such mappings are shared if possible;
- writes to the file do not affect the mapping; writes to the mapping are
- visible in other processes (no MMU protection), but should not happen.
+ In the no-MMU case:
+
+ - If one exists, the kernel will re-use an existing mapping to the
+ same segment of the same file if that has compatible permissions,
+ even if this was created by another process.
+
+ - If possible, the file mapping will be directly on the backing device
+ if the backing device has the BDI_CAP_MAP_DIRECT capability and
+ appropriate mapping protection capabilities. Ramfs, romfs, cramfs
+ and mtd might all permit this.
+
+ - If the backing device device can't or won't permit direct sharing,
+ but does have the BDI_CAP_MAP_COPY capability, then a copy of the
+ appropriate bit of the file will be read into a contiguous bit of
+ memory and any extraneous space beyond the EOF will be cleared
+
+ - Writes to the file do not affect the mapping; writes to the mapping
+ are visible in other processes (no MMU protection), but should not
+ happen.

(*) File, MAP_PRIVATE, PROT_READ / PROT_EXEC, PROT_WRITE

In the MMU case: like the non-PROT_WRITE case, except that the pages in
question get copied before the write actually happens. From that point
- on writes to that page in the file no longer get reflected into the
- mapping's backing pages.
+ on writes to the file underneath that page no longer get reflected into
+ the mapping's backing pages. The page is then backed by swap instead.

- In the no-MMU case: works exactly as for the non-PROT_WRITE case.
+ In the no-MMU case: works much like the non-PROT_WRITE case, except
+ that a copy is always taken and never shared.

(*) Regular file / blockdev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE

@@ -70,6 +85,15 @@ and it's also much more restricted in th
as for the MMU case. If the filesystem does not provide any such
support, then the mapping request will be denied.

+ (*) Memory backed blockdev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE
+
+ In the MMU case: As for ordinary regular files.
+
+ In the no-MMU case: As for memory backed regular files, but the
+ blockdev must be able to provide a contiguous run of pages without
+ truncate being called. The ramdisk driver could do this if it allocated
+ all its memory as a contiguous array upfront.
+
(*) Memory backed chardev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE

In the MMU case: As for ordinary regular files.
@@ -95,12 +119,12 @@ FURTHER NOTES ON NO-MMU MMAP
(*) Supplying MAP_FIXED or a requesting a particular mapping address will
result in an error.

- (*) Files mapped privately must have a read method provided by the driver or
- filesystem so that the contents can be read into the memory allocated. An
+ (*) Files mapped privately usually have to have a read method provided by the
+ driver or filesystem so that the contents can be read into the memory
+ allocated if mmap() chooses not to map the backing device directly. An
error will result if they don't. This is most likely to be encountered
with character device files, pipes, fifos and sockets.

-
============================================
PROVIDING SHAREABLE CHARACTER DEVICE SUPPORT
============================================
@@ -111,6 +135,15 @@ to get a proposed address for the mappin
doesn't wish to honour the mapping because it's too long, at a weird offset,
under some unsupported combination of flags or whatever.

+The driver should also provide backing device information with capabilities set
+to indicate the permitted types of mapping on such devices. The default is
+assumed to be readable and writable, not executable, and only shareable
+directly (can't be copied).
+
+The file->f_op->mmap() operation will be called to actually inaugurate the
+mapping. It can be rejected at that point. Returning the ENOSYS error will
+cause the mapping to be copied instead if BDI_CAP_MAP_COPY is specified.
+
The vm_ops->close() routine will be invoked when the last mapping on a chardev
is removed. An existing mapping will be shared, partially or not, if possible
without notifying the driver.
@@ -120,7 +153,22 @@ return -ENOSYS. This will be taken to me
want to handle it, despite the fact it's got an operation. For instance, it
might try directing the call to a secondary driver which turns out not to
implement it. Such is the case for the framebuffer driver which attempts to
-direct the call to the device-specific driver.
+direct the call to the device-specific driver. Under such circumstances, the
+mapping request will be rejected if BDI_CAP_MAP_COPY is not specified, and a
+copy mapped otherwise.
+
+IMPORTANT NOTE:
+
+ Some types of device may present a different appearance to anyone
+ looking at them in certain modes. Flash chips can be like this; for
+ instance if they're in programming or erase mode, you might see the
+ status reflected in the mapping, instead of the data.
+
+ In such a case, care must be taken lest userspace see a shared or a
+ private mapping showing such information when the driver is busy
+ controlling the device. Remember especially: private executable
+ mappings may still be mapped directly off the device under some
+ circumstances!


==============================================
@@ -139,3 +187,12 @@ memory.

Memory backed devices are indicated by the mapping's backing device info having
the memory_backed flag set.
+
+
+========================================
+PROVIDING SHAREABLE BLOCK DEVICE SUPPORT
+========================================
+
+Provision of shared mappings on block device files is exactly the same as for
+character devices. If there isn't a real device underneath, then the driver
+should allocate sufficient contiguous memory to honour any supported mapping.

2005-03-02 22:41:26

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information



On Wed, 2 Mar 2005, Andrew Morton wrote:
>
> Why not make these bitfields as well?

Side note: bitfields aren't exactly wonderful. They tend to generate worse
code, and they make it much harder to work with combinations of flags
(both testing and initializing). They also have architecture-specific
bit-order and packing issues, which means that when coupled with device
drivers etc they can be a major pain in the derriere.

Even apart from those issues, they can't sanely be atomically accessed, so
in many cases they just aren't a good thing.

In contrast, just using a flag word and explicit bitmask has none of those
problems, and it's often easy to abstract out things with a macro and/or
inline function.

So don't go for bitfields "just because". It's generally a good idea to
_require_ that the code in question has none of the potential problem
spots even in _theory_, and in addition also show that bitfields really
make the code look nicer. The downsides really can be that nasty.

Linus

2005-03-02 22:45:37

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

Linus Torvalds <[email protected]> wrote:
>
> On Wed, 2 Mar 2005, Andrew Morton wrote:
> >
> > Why not make these bitfields as well?
>
> Side note: bitfields aren't exactly wonderful.

Yup. In this application the fields are initialised once (usually at
compile time) and are never modified. So the compiler should be able to
generate the same code as it would with an open-coded bit test. Which is
about the only situation where we should use bitfields, IMO.

That being said, there aren't many backing_dev_info's in a system, so we
won't be saving much memory. Some architectures will presumably generate
faster code with plain old integers. You'd only ever lose if the
backing_dev_info happened to straddle a cacheline boundary. It's marginal.

2005-03-02 22:09:33

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

David Howells <[email protected]> wrote:
>
>
> The attached patch does two things:
>
> (1) It gets rid of backing_dev_info::memory_backed and replaces it with a
> pair of boolean values:
>
> (*) dirty_memory_acct
>
> True if the pages associated with this backing device should be
> tracked by dirty page accounting.
>
> (*) writeback_if_dirty
>
> True if the pages associated with this backing device should have
> writepage() or writepages() invoked upon them to clean them.

Cool, thanks.

> (2) It adds a backing device capability mask that indicates what a backing
> device is capable of; currently only in regard to memory mapping
> facilities. These flags indicate whether a device can be mapped directly,
> whether it can be copied for a mapping, and whether direct mappings can
> be read, written and/or executed. This information is primarily aimed at
> improving no-MMU private mapping support.
>
> ...

> +#define BDI_CAP_MAP_COPY 0x00000001 /* Copy can be mapped (MAP_PRIVATE) */
> +#define BDI_CAP_MAP_DIRECT 0x00000002 /* Can be mapped directly (MAP_SHARED) */

Why not make these bitfields as well?

2005-03-03 13:38:43

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

Andrew Morton <[email protected]> wrote:

>
> > +#define BDI_CAP_MAP_COPY 0x00000001 /* Copy can be mapped (MAP_PRIVATE) */
> > +#define BDI_CAP_MAP_DIRECT 0x00000002 /* Can be mapped directly (MAP_SHARED) */
>
> Why not make these bitfields as well?

Because I want to copy the capabilities mask (including these variables) into
a variable in the nommu mmap implementation and eliminate various bits from
that variable under certain conditions.

Making these into bitfields would result in having to use three variables
instead of just the one.

David

2005-03-03 13:44:25

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

Andrew Morton <[email protected]> wrote:

> Yup. In this application the fields are initialised once (usually at
> compile time) and are never modified.

That's not exactly so. The block layer appears to modify them. See
blk_queue_make_request() in ll_rw_blk.c.

David

2005-03-03 20:43:07

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

David Howells <[email protected]> wrote:
>
> Andrew Morton <[email protected]> wrote:
>
> > Yup. In this application the fields are initialised once (usually at
> > compile time) and are never modified.
>
> That's not exactly so. The block layer appears to modify them. See
> blk_queue_make_request() in ll_rw_blk.c.
>

That's a (strangely named) once-off setup thing.

2005-03-03 20:42:59

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

David Howells <[email protected]> wrote:
>
> Andrew Morton <[email protected]> wrote:
>
> >
> > > +#define BDI_CAP_MAP_COPY 0x00000001 /* Copy can be mapped (MAP_PRIVATE) */
> > > +#define BDI_CAP_MAP_DIRECT 0x00000002 /* Can be mapped directly (MAP_SHARED) */
> >
> > Why not make these bitfields as well?
>
> Because I want to copy the capabilities mask (including these variables) into
> a variable in the nommu mmap implementation and eliminate various bits from
> that variable under certain conditions.
>
> Making these into bitfields would result in having to use three variables
> instead of just the one.

Well let's do one or the other, and not have it half-and-half, please.

2005-03-07 11:24:20

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

Andrew Morton <[email protected]> wrote:

> > Making these into bitfields would result in having to use three variables
> > instead of just the one.
>
> Well let's do one or the other, and not have it half-and-half, please.

So I should fold the two other bitfields back into the capabilities mask and
make it an unsigned long.

David

2005-03-07 11:38:17

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

David Howells <[email protected]> wrote:
>
> Andrew Morton <[email protected]> wrote:
>
> > > Making these into bitfields would result in having to use three variables
> > > instead of just the one.
> >
> > Well let's do one or the other, and not have it half-and-half, please.
>
> So I should fold the two other bitfields back into the capabilities mask and
> make it an unsigned long.

I suppose so. Although unsigned int would be preferable.

2005-03-07 11:43:25

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

Andrew Morton <[email protected]> wrote:

> > So I should fold the two other bitfields back into the capabilities mask
> > and make it an unsigned long.
>
> I suppose so. Although unsigned int would be preferable.

Any particular reason? It's mixed in with other unsigned longs and pointers
after all...

2005-03-07 11:48:32

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

David Howells <[email protected]> wrote:
>
> Andrew Morton <[email protected]> wrote:
>
> > > So I should fold the two other bitfields back into the capabilities mask
> > > and make it an unsigned long.
> >
> > I suppose so. Although unsigned int would be preferable.
>
> Any particular reason? It's mixed in with other unsigned longs and pointers
> after all...

Just that it's the natural wordsize of the machine, and uses less storage.

2005-03-07 11:52:18

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

Andrew Morton <[email protected]> wrote:

> > Any particular reason? It's mixed in with other unsigned longs and pointers
> > after all...
>
> Just that it's the natural wordsize of the machine, and uses less storage.

Making it unsigned long on a 32-bit machine will make no difference. Making it
unsigned int on a 64-bit machine will waste four bytes.

David

2005-03-07 12:11:33

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

David Howells <[email protected]> wrote:
>
> Andrew Morton <[email protected]> wrote:
>
> > > Any particular reason? It's mixed in with other unsigned longs and pointers
> > > after all...
> >
> > Just that it's the natural wordsize of the machine, and uses less storage.
>
> Making it unsigned long on a 32-bit machine will make no difference. Making it
> unsigned int on a 64-bit machine will waste four bytes.
>

No, it won't waste any bytes at all. It won't save any either.

But if someone comes along later and wants to add another int to that
structure, they won't know that your field was needlessly made a long, and
they'll then waste another eight bytes.

2005-03-07 12:25:45

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

Andrew Morton <[email protected]> wrote:

> > Making it unsigned long on a 32-bit machine will make no
> > difference. Making it unsigned int on a 64-bit machine will waste four
> > bytes.
>
> No, it won't waste any bytes at all. It won't save any either.
>
> But if someone comes along later and wants to add another int to that
> structure, they won't know that your field was needlessly made a long, and
> they'll then waste another eight bytes.

Okay... Fair enough.

David

2005-03-07 14:06:24

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information


The attached patch replaces backing_dev_info::memory_backed with capabilitied
bitmap. The capabilities available include:

(*) BDI_CAP_ACCOUNT_DIRTY

Set if the pages associated with this backing device should be tracked
by dirty page accounting.

(*) BDI_CAP_WRITEBACK_DIRTY

Set if dirty pages associated with this backing device should have
writepage() or writepages() invoked upon them to clean them.

(*) Capability markers that indicate what a backing device is capable of
with regard to memory mapping facilities. These flags indicate whether a
device can be mapped directly, whether it can be copied for a mapping,
and whether direct mappings can be read, written and/or executed. This
information is primarily aimed at improving no-MMU private mapping
support.

The patch alco provides convenience functions for determining the dirty-page
capabilities available on backing devices directly or on the backing devices
associated with a mapping. These are provided to keep line length down when
checking for the capabilities.

Signed-Off-By: David Howells <[email protected]>
---
warthog>diffstat -p1 memback-bdicap-2611rc4-2.diff
drivers/block/ll_rw_blk.c | 2 +-
drivers/block/rd.c | 6 +++---
drivers/char/mem.c | 6 ++++++
fs/buffer.c | 2 +-
fs/fs-writeback.c | 4 ++--
fs/hugetlbfs/inode.c | 2 +-
fs/ramfs/inode.c | 3 ++-
fs/sysfs/inode.c | 2 +-
include/linux/backing-dev.h | 41 ++++++++++++++++++++++++++++++++++++++++-
mm/filemap.c | 6 +++---
mm/page-writeback.c | 6 +++---
mm/readahead.c | 1 +
mm/shmem.c | 4 ++--
mm/swap_state.c | 2 +-
14 files changed, 67 insertions(+), 20 deletions(-)

diff -uNrp /warthog/kernels/linux-2.6.11-rc4/include/linux/backing-dev.h linux-2.6.11-rc4-memback/include/linux/backing-dev.h
--- /warthog/kernels/linux-2.6.11-rc4/include/linux/backing-dev.h 2004-06-18 13:44:05.000000000 +0100
+++ linux-2.6.11-rc4-memback/include/linux/backing-dev.h 2005-03-07 13:34:16.068878317 +0000
@@ -25,13 +25,39 @@ typedef int (congested_fn)(void *, int);
struct backing_dev_info {
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* Always use atomic bitops on this */
- int memory_backed; /* Cannot clean pages with writepage */
+ unsigned int capabilities; /* Device capabilities */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
};

+
+/*
+ * Flags in backing_dev_info::capability
+ * - The first two flags control whether dirty pages will contribute to the
+ * VM's accounting and whether writepages() should be called for dirty pages
+ * (something that would not, for example, be appropriate for ramfs)
+ * - These flags let !MMU mmap() govern direct device mapping vs immediate
+ * copying more easily for MAP_PRIVATE, especially for ROM filesystems
+ */
+#define BDI_CAP_ACCOUNT_DIRTY 0x00000001 /* Dirty pages should contribute to accounting */
+#define BDI_CAP_WRITEBACK_DIRTY 0x00000002 /* Dirty pages should be written back */
+#define BDI_CAP_MAP_COPY 0x00000004 /* Copy can be mapped (MAP_PRIVATE) */
+#define BDI_CAP_MAP_DIRECT 0x00000008 /* Can be mapped directly (MAP_SHARED) */
+#define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */
+#define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */
+#define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */
+#define BDI_CAP_VMFLAGS \
+ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
+
+#if defined(VM_MAYREAD) && \
+ (BDI_CAP_READ_MAP != VM_MAYREAD || \
+ BDI_CAP_WRITE_MAP != VM_MAYWRITE || \
+ BDI_CAP_EXEC_MAP != VM_MAYEXEC)
+#error please change backing_dev_info::capabilities flags
+#endif
+
extern struct backing_dev_info default_backing_dev_info;
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);

@@ -62,4 +88,17 @@ static inline int bdi_rw_congested(struc
(1 << BDI_write_congested));
}

+#define bdi_cap_writeback_dirty(bdi) \
+ ((bdi)->capabilities & BDI_CAP_WRITEBACK_DIRTY)
+
+#define bdi_cap_account_dirty(bdi) \
+ ((bdi)->capabilities & BDI_CAP_ACCOUNT_DIRTY)
+
+#define mapping_cap_writeback_dirty(mapping) \
+ bdi_cap_writeback_dirty((mapping)->backing_dev_info)
+
+#define mapping_cap_account_dirty(mapping) \
+ bdi_cap_account_dirty((mapping)->backing_dev_info)
+
+
#endif /* _LINUX_BACKING_DEV_H */
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/drivers/block/ll_rw_blk.c linux-2.6.11-rc4-memback/drivers/block/ll_rw_blk.c
--- /warthog/kernels/linux-2.6.11-rc4/drivers/block/ll_rw_blk.c 2005-02-14 12:18:23.000000000 +0000
+++ linux-2.6.11-rc4-memback/drivers/block/ll_rw_blk.c 2005-03-07 13:35:46.661284198 +0000
@@ -238,7 +238,7 @@ void blk_queue_make_request(request_queu
q->make_request_fn = mfn;
q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
- q->backing_dev_info.memory_backed = 0;
+ q->backing_dev_info.capabilities = BDI_CAP_ACCOUNT_DIRTY | BDI_CAP_WRITEBACK_DIRTY;
blk_queue_max_sectors(q, MAX_SECTORS);
blk_queue_hardsect_size(q, 512);
blk_queue_dma_alignment(q, 511);
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/drivers/block/rd.c linux-2.6.11-rc4-memback/drivers/block/rd.c
--- /warthog/kernels/linux-2.6.11-rc4/drivers/block/rd.c 2005-01-04 11:13:05.000000000 +0000
+++ linux-2.6.11-rc4-memback/drivers/block/rd.c 2005-03-07 13:51:11.665672065 +0000
@@ -325,7 +325,7 @@ static int rd_ioctl(struct inode *inode,
*/
static struct backing_dev_info rd_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .capabilities = 0, /* Doesn't contribute to dirty memory */
.unplug_io_fn = default_unplug_io_fn,
};

@@ -336,8 +336,8 @@ static struct backing_dev_info rd_backin
*/
static struct backing_dev_info rd_file_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 0, /* Does contribute to dirty memory */
- .unplug_io_fn = default_unplug_io_fn,
+ .capabilities = 1, /* Does contribute to dirty memory */
+ .unplug_io_fn = default_unplug_io_fn,
};

static int rd_open(struct inode *inode, struct file *filp)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/drivers/char/mem.c linux-2.6.11-rc4-memback/drivers/char/mem.c
--- /warthog/kernels/linux-2.6.11-rc4/drivers/char/mem.c 2005-01-04 11:13:10.000000000 +0000
+++ linux-2.6.11-rc4-memback/drivers/char/mem.c 2005-03-02 21:06:39.000000000 +0000
@@ -23,6 +23,7 @@
#include <linux/devfs_fs_kernel.h>
#include <linux/ptrace.h>
#include <linux/device.h>
+#include <linux/backing-dev.h>

#include <asm/uaccess.h>
#include <asm/io.h>
@@ -613,6 +614,10 @@ static struct file_operations zero_fops
.mmap = mmap_zero,
};

+static struct backing_dev_info zero_bdi = {
+ .capabilities = BDI_CAP_MAP_COPY,
+};
+
static struct file_operations full_fops = {
.llseek = full_lseek,
.read = read_full,
@@ -659,6 +664,7 @@ static int memory_open(struct inode * in
break;
#endif
case 5:
+ filp->f_mapping->backing_dev_info = &zero_bdi;
filp->f_op = &zero_fops;
break;
case 7:
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/buffer.c linux-2.6.11-rc4-memback/fs/buffer.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/buffer.c 2005-02-14 12:18:50.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/buffer.c 2005-03-07 13:25:05.000000000 +0000
@@ -876,7 +876,7 @@ int __set_page_dirty_buffers(struct page
if (!TestSetPageDirty(page)) {
spin_lock_irq(&mapping->tree_lock);
if (page->mapping) { /* Race with truncate? */
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping_cap_account_dirty(mapping))
inc_page_state(nr_dirty);
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/fs-writeback.c linux-2.6.11-rc4-memback/fs/fs-writeback.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/fs-writeback.c 2005-02-14 12:18:51.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/fs-writeback.c 2005-03-07 13:27:24.000000000 +0000
@@ -315,7 +315,7 @@ sync_sb_inodes(struct super_block *sb, s
struct backing_dev_info *bdi = mapping->backing_dev_info;
long pages_skipped;

- if (bdi->memory_backed) {
+ if (!bdi_cap_writeback_dirty(bdi)) {
list_move(&inode->i_list, &sb->s_dirty);
if (sb == blockdev_superblock) {
/*
@@ -566,7 +566,7 @@ int write_inode_now(struct inode *inode,
.sync_mode = WB_SYNC_ALL,
};

- if (inode->i_mapping->backing_dev_info->memory_backed)
+ if (!mapping_cap_writeback_dirty(inode->i_mapping))
return 0;

might_sleep();
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/hugetlbfs/inode.c linux-2.6.11-rc4-memback/fs/hugetlbfs/inode.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/hugetlbfs/inode.c 2005-02-14 12:18:51.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/hugetlbfs/inode.c 2005-03-07 13:52:36.855520939 +0000
@@ -40,7 +40,7 @@ static struct inode_operations hugetlbfs

static struct backing_dev_info hugetlbfs_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .capabilities = 0, /* Doesn't contribute to dirty memory */
};

int sysctl_hugetlb_shm_group;
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/ramfs/inode.c linux-2.6.11-rc4-memback/fs/ramfs/inode.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/ramfs/inode.c 2005-02-14 12:18:53.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/ramfs/inode.c 2005-03-07 13:29:36.086354487 +0000
@@ -45,7 +45,8 @@ static struct inode_operations ramfs_dir

static struct backing_dev_info ramfs_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .capabilities = BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
+ BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
};

struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/sysfs/inode.c linux-2.6.11-rc4-memback/fs/sysfs/inode.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/sysfs/inode.c 2005-01-04 11:13:43.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/sysfs/inode.c 2005-03-07 13:30:13.374227409 +0000
@@ -23,7 +23,7 @@ static struct address_space_operations s

static struct backing_dev_info sysfs_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .capabilities = 0, /* Doesn't contribute to dirty memory */
};

struct inode * sysfs_new_inode(mode_t mode)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/filemap.c linux-2.6.11-rc4-memback/mm/filemap.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/filemap.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/filemap.c 2005-03-07 13:25:50.000000000 +0000
@@ -172,7 +172,7 @@ static int __filemap_fdatawrite_range(st
.end = end,
};

- if (mapping->backing_dev_info->memory_backed)
+ if (!mapping_cap_writeback_dirty(mapping))
return 0;

ret = do_writepages(mapping, &wbc);
@@ -269,7 +269,7 @@ int sync_page_range(struct inode *inode,
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
int ret;

- if (mapping->backing_dev_info->memory_backed || !count)
+ if (!mapping_cap_writeback_dirty(mapping) || !count)
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0) {
@@ -295,7 +295,7 @@ int sync_page_range_nolock(struct inode
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
int ret;

- if (mapping->backing_dev_info->memory_backed || !count)
+ if (!mapping_cap_writeback_dirty(mapping) || !count)
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/page-writeback.c linux-2.6.11-rc4-memback/mm/page-writeback.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/page-writeback.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/page-writeback.c 2005-03-07 13:26:25.000000000 +0000
@@ -605,7 +605,7 @@ int __set_page_dirty_nobuffers(struct pa
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping_cap_account_dirty(mapping))
inc_page_state(nr_dirty);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
@@ -691,7 +691,7 @@ int test_clear_page_dirty(struct page *p
page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping_cap_account_dirty(mapping))
dec_page_state(nr_dirty);
return 1;
}
@@ -722,7 +722,7 @@ int clear_page_dirty_for_io(struct page

if (mapping) {
if (TestClearPageDirty(page)) {
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping_cap_account_dirty(mapping))
dec_page_state(nr_dirty);
return 1;
}
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/readahead.c linux-2.6.11-rc4-memback/mm/readahead.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/readahead.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/readahead.c 2005-03-07 13:20:47.000000000 +0000
@@ -23,6 +23,7 @@ EXPORT_SYMBOL(default_unplug_io_fn);
struct backing_dev_info default_backing_dev_info = {
.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
.state = 0,
+ .capabilities = BDI_CAP_ACCOUNT_DIRTY | BDI_CAP_WRITEBACK_DIRTY,
.unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/shmem.c linux-2.6.11-rc4-memback/mm/shmem.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/shmem.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/shmem.c 2005-03-07 13:21:13.000000000 +0000
@@ -184,8 +184,8 @@ static struct vm_operations_struct shmem

static struct backing_dev_info shmem_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
- .unplug_io_fn = default_unplug_io_fn,
+ .capabilities = 0, /* Doesn't contribute to dirty memory */
+ .unplug_io_fn = default_unplug_io_fn,
};

static LIST_HEAD(shmem_swaplist);
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/swap_state.c linux-2.6.11-rc4-memback/mm/swap_state.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/swap_state.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/swap_state.c 2005-03-07 13:20:40.000000000 +0000
@@ -29,7 +29,7 @@ static struct address_space_operations s
};

static struct backing_dev_info swap_backing_dev_info = {
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .capabilities = 0, /* Doesn't contribute to dirty memory */
.unplug_io_fn = swap_unplug_io_fn,
};

2005-03-07 15:22:47

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

> The attached patch replaces backing_dev_info::memory_backed with
> capabilitied bitmap. The capabilities available include:

Wouldn't it be better to reverse the meaning of BDI_CAP_ACCOUNT_DIRTY
and BDI_CAP_WRITEBACK_DIRTY (BDI_CAP_NO_ACCOUNT_DIRTY...)? That way
out of tree filesystems that implicitly zero bdi->memory_backed
wouldn't _silently_ break. E.g. fuse does this, though it would not
actually break since it doesn't dirty any pages currently. I have no
idea whether there are other filesystems that are affected.

Miklos

2005-03-07 15:41:43

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

Miklos Szeredi <[email protected]> wrote:

> > The attached patch replaces backing_dev_info::memory_backed with
> > capabilitied bitmap. The capabilities available include:
>
> Wouldn't it be better to reverse the meaning of BDI_CAP_ACCOUNT_DIRTY
> and BDI_CAP_WRITEBACK_DIRTY (BDI_CAP_NO_ACCOUNT_DIRTY...)? That way
> out of tree filesystems that implicitly zero bdi->memory_backed
> wouldn't _silently_ break. E.g. fuse does this, though it would not
> actually break since it doesn't dirty any pages currently. I have no
> idea whether there are other filesystems that are affected.

It shouldn't silently break... It will refuse to compile. I renamed
"memory_backed" to "capabilities".

David

2005-03-07 15:45:46

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

> > Wouldn't it be better to reverse the meaning of BDI_CAP_ACCOUNT_DIRTY
> > and BDI_CAP_WRITEBACK_DIRTY (BDI_CAP_NO_ACCOUNT_DIRTY...)? That way
> > out of tree filesystems that implicitly zero bdi->memory_backed
> > wouldn't _silently_ break. E.g. fuse does this, though it would not
> > actually break since it doesn't dirty any pages currently. I have no
> > idea whether there are other filesystems that are affected.
>
> It shouldn't silently break... It will refuse to compile. I renamed
> "memory_backed" to "capabilities".

This will silently break:

static struct backing_dev_info my_bdi = {
.ra_pages = MY_RA,
.unplug_io_fn = default_unplug_io_fn,
};

Miklos

2005-03-07 16:07:06

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

Miklos Szeredi <[email protected]> wrote:

> > It shouldn't silently break... It will refuse to compile. I renamed
> > "memory_backed" to "capabilities".
>
> This will silently break:
>
> static struct backing_dev_info my_bdi = {
> .ra_pages = MY_RA,
> .unplug_io_fn = default_unplug_io_fn,
> };

Sorry, yes. Obvious. Ugh. Andrew Morton suggested flipping the logic, and
although it was in conjunction with turning the concepts into bitfields, it
still stands here.

David

2005-03-07 16:15:39

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

> Sorry, yes. Obvious. Ugh. Andrew Morton suggested flipping the logic, and
> although it was in conjunction with turning the concepts into bitfields, it
> still stands here.

OK, obviously Andrew has the final word in this. I just suggested
that it might be safer to have the logic flipped back.

Miklos

2005-03-07 21:25:40

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information

Miklos Szeredi <[email protected]> wrote:
>
> > Sorry, yes. Obvious. Ugh. Andrew Morton suggested flipping the logic, and
> > although it was in conjunction with turning the concepts into bitfields, it
> > still stands here.
>
> OK, obviously Andrew has the final word in this. I just suggested
> that it might be safer to have the logic flipped back.
>

Experience indicates that it's safer to ignore anything I say on this topic.

Yeah, it would be better to not flip the logic.

2005-03-08 14:13:57

by David Howells

[permalink] [raw]
Subject: [PATCH 1/2] BDI: Provide backing device capability information [try #3]


The attached patch replaces backing_dev_info::memory_backed with capabilitied
bitmap. The capabilities available include:

(*) BDI_CAP_NO_ACCT_DIRTY

Set if the pages associated with this backing device should not be
tracked by the dirty page accounting.

(*) BDI_CAP_NO_WRITEBACK

Set if dirty pages associated with this backing device should not have
writepage() or writepages() invoked upon them to clean them.

(*) Capability markers that indicate what a backing device is capable of
with regard to memory mapping facilities. These flags indicate whether a
device can be mapped directly, whether it can be copied for a mapping,
and whether direct mappings can be read, written and/or executed. This
information is primarily aimed at improving no-MMU private mapping
support.

The patch also provides convenience functions for determining the dirty-page
capabilities available on backing devices directly or on the backing devices
associated with a mapping. These are provided to keep line length down when
checking for the capabilities.

Signed-Off-By: David Howells <[email protected]>
---
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/include/linux/backing-dev.h linux-2.6.11-rc4-memback/include/linux/backing-dev.h
--- /warthog/kernels/linux-2.6.11-rc4/include/linux/backing-dev.h 2004-06-18 13:44:05.000000000 +0100
+++ linux-2.6.11-rc4-memback/include/linux/backing-dev.h 2005-03-08 11:03:45.000000000 +0000
@@ -25,13 +25,39 @@ typedef int (congested_fn)(void *, int);
struct backing_dev_info {
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* Always use atomic bitops on this */
- int memory_backed; /* Cannot clean pages with writepage */
+ unsigned int capabilities; /* Device capabilities */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
};

+
+/*
+ * Flags in backing_dev_info::capability
+ * - The first two flags control whether dirty pages will contribute to the
+ * VM's accounting and whether writepages() should be called for dirty pages
+ * (something that would not, for example, be appropriate for ramfs)
+ * - These flags let !MMU mmap() govern direct device mapping vs immediate
+ * copying more easily for MAP_PRIVATE, especially for ROM filesystems
+ */
+#define BDI_CAP_NO_ACCT_DIRTY 0x00000001 /* Dirty pages shouldn't contribute to accounting */
+#define BDI_CAP_NO_WRITEBACK 0x00000002 /* Don't write pages back */
+#define BDI_CAP_MAP_COPY 0x00000004 /* Copy can be mapped (MAP_PRIVATE) */
+#define BDI_CAP_MAP_DIRECT 0x00000008 /* Can be mapped directly (MAP_SHARED) */
+#define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */
+#define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */
+#define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */
+#define BDI_CAP_VMFLAGS \
+ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
+
+#if defined(VM_MAYREAD) && \
+ (BDI_CAP_READ_MAP != VM_MAYREAD || \
+ BDI_CAP_WRITE_MAP != VM_MAYWRITE || \
+ BDI_CAP_EXEC_MAP != VM_MAYEXEC)
+#error please change backing_dev_info::capabilities flags
+#endif
+
extern struct backing_dev_info default_backing_dev_info;
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);

@@ -62,4 +88,17 @@ static inline int bdi_rw_congested(struc
(1 << BDI_write_congested));
}

+#define bdi_cap_writeback_dirty(bdi) \
+ (!((bdi)->capabilities & BDI_CAP_NO_WRITEBACK))
+
+#define bdi_cap_account_dirty(bdi) \
+ (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
+
+#define mapping_cap_writeback_dirty(mapping) \
+ bdi_cap_writeback_dirty((mapping)->backing_dev_info)
+
+#define mapping_cap_account_dirty(mapping) \
+ bdi_cap_account_dirty((mapping)->backing_dev_info)
+
+
#endif /* _LINUX_BACKING_DEV_H */
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/drivers/block/ll_rw_blk.c linux-2.6.11-rc4-memback/drivers/block/ll_rw_blk.c
--- /warthog/kernels/linux-2.6.11-rc4/drivers/block/ll_rw_blk.c 2005-02-14 12:18:23.000000000 +0000
+++ linux-2.6.11-rc4-memback/drivers/block/ll_rw_blk.c 2005-03-08 11:09:54.000000000 +0000
@@ -238,7 +238,7 @@ void blk_queue_make_request(request_queu
q->make_request_fn = mfn;
q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
- q->backing_dev_info.memory_backed = 0;
+ q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
blk_queue_max_sectors(q, MAX_SECTORS);
blk_queue_hardsect_size(q, 512);
blk_queue_dma_alignment(q, 511);
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/drivers/block/rd.c linux-2.6.11-rc4-memback/drivers/block/rd.c
--- /warthog/kernels/linux-2.6.11-rc4/drivers/block/rd.c 2005-01-04 11:13:05.000000000 +0000
+++ linux-2.6.11-rc4-memback/drivers/block/rd.c 2005-03-08 11:14:00.000000000 +0000
@@ -325,7 +325,7 @@ static int rd_ioctl(struct inode *inode,
*/
static struct backing_dev_info rd_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | BDI_CAP_MAP_COPY,
.unplug_io_fn = default_unplug_io_fn,
};

@@ -336,7 +336,7 @@ static struct backing_dev_info rd_backin
*/
static struct backing_dev_info rd_file_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 0, /* Does contribute to dirty memory */
+ .capabilities = BDI_CAP_MAP_COPY, /* Does contribute to dirty memory */
.unplug_io_fn = default_unplug_io_fn,
};

diff -uNrp /warthog/kernels/linux-2.6.11-rc4/drivers/char/mem.c linux-2.6.11-rc4-memback/drivers/char/mem.c
--- /warthog/kernels/linux-2.6.11-rc4/drivers/char/mem.c 2005-01-04 11:13:10.000000000 +0000
+++ linux-2.6.11-rc4-memback/drivers/char/mem.c 2005-03-08 11:15:19.000000000 +0000
@@ -23,6 +23,7 @@
#include <linux/devfs_fs_kernel.h>
#include <linux/ptrace.h>
#include <linux/device.h>
+#include <linux/backing-dev.h>

#include <asm/uaccess.h>
#include <asm/io.h>
@@ -613,6 +614,10 @@ static struct file_operations zero_fops
.mmap = mmap_zero,
};

+static struct backing_dev_info zero_bdi = {
+ .capabilities = BDI_CAP_MAP_COPY,
+};
+
static struct file_operations full_fops = {
.llseek = full_lseek,
.read = read_full,
@@ -659,6 +664,7 @@ static int memory_open(struct inode * in
break;
#endif
case 5:
+ filp->f_mapping->backing_dev_info = &zero_bdi;
filp->f_op = &zero_fops;
break;
case 7:
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/buffer.c linux-2.6.11-rc4-memback/fs/buffer.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/buffer.c 2005-02-14 12:18:50.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/buffer.c 2005-03-07 13:25:05.000000000 +0000
@@ -876,7 +876,7 @@ int __set_page_dirty_buffers(struct page
if (!TestSetPageDirty(page)) {
spin_lock_irq(&mapping->tree_lock);
if (page->mapping) { /* Race with truncate? */
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping_cap_account_dirty(mapping))
inc_page_state(nr_dirty);
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/fs-writeback.c linux-2.6.11-rc4-memback/fs/fs-writeback.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/fs-writeback.c 2005-02-14 12:18:51.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/fs-writeback.c 2005-03-07 13:27:24.000000000 +0000
@@ -315,7 +315,7 @@ sync_sb_inodes(struct super_block *sb, s
struct backing_dev_info *bdi = mapping->backing_dev_info;
long pages_skipped;

- if (bdi->memory_backed) {
+ if (!bdi_cap_writeback_dirty(bdi)) {
list_move(&inode->i_list, &sb->s_dirty);
if (sb == blockdev_superblock) {
/*
@@ -566,7 +566,7 @@ int write_inode_now(struct inode *inode,
.sync_mode = WB_SYNC_ALL,
};

- if (inode->i_mapping->backing_dev_info->memory_backed)
+ if (!mapping_cap_writeback_dirty(inode->i_mapping))
return 0;

might_sleep();
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/hugetlbfs/inode.c linux-2.6.11-rc4-memback/fs/hugetlbfs/inode.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/hugetlbfs/inode.c 2005-02-14 12:18:51.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/hugetlbfs/inode.c 2005-03-08 11:09:21.000000000 +0000
@@ -40,7 +40,7 @@ static struct inode_operations hugetlbfs

static struct backing_dev_info hugetlbfs_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
};

int sysctl_hugetlb_shm_group;
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/ramfs/inode.c linux-2.6.11-rc4-memback/fs/ramfs/inode.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/ramfs/inode.c 2005-02-14 12:18:53.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/ramfs/inode.c 2005-03-08 11:08:09.000000000 +0000
@@ -45,7 +45,9 @@ static struct inode_operations ramfs_dir

static struct backing_dev_info ramfs_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK |
+ BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
+ BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
};

struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/fs/sysfs/inode.c linux-2.6.11-rc4-memback/fs/sysfs/inode.c
--- /warthog/kernels/linux-2.6.11-rc4/fs/sysfs/inode.c 2005-01-04 11:13:43.000000000 +0000
+++ linux-2.6.11-rc4-memback/fs/sysfs/inode.c 2005-03-08 11:12:24.000000000 +0000
@@ -23,7 +23,7 @@ static struct address_space_operations s

static struct backing_dev_info sysfs_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
};

struct inode * sysfs_new_inode(mode_t mode)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/filemap.c linux-2.6.11-rc4-memback/mm/filemap.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/filemap.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/filemap.c 2005-03-07 13:25:50.000000000 +0000
@@ -172,7 +172,7 @@ static int __filemap_fdatawrite_range(st
.end = end,
};

- if (mapping->backing_dev_info->memory_backed)
+ if (!mapping_cap_writeback_dirty(mapping))
return 0;

ret = do_writepages(mapping, &wbc);
@@ -269,7 +269,7 @@ int sync_page_range(struct inode *inode,
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
int ret;

- if (mapping->backing_dev_info->memory_backed || !count)
+ if (!mapping_cap_writeback_dirty(mapping) || !count)
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0) {
@@ -295,7 +295,7 @@ int sync_page_range_nolock(struct inode
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
int ret;

- if (mapping->backing_dev_info->memory_backed || !count)
+ if (!mapping_cap_writeback_dirty(mapping) || !count)
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0)
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/page-writeback.c linux-2.6.11-rc4-memback/mm/page-writeback.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/page-writeback.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/page-writeback.c 2005-03-07 13:26:25.000000000 +0000
@@ -605,7 +605,7 @@ int __set_page_dirty_nobuffers(struct pa
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping_cap_account_dirty(mapping))
inc_page_state(nr_dirty);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
@@ -691,7 +691,7 @@ int test_clear_page_dirty(struct page *p
page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping_cap_account_dirty(mapping))
dec_page_state(nr_dirty);
return 1;
}
@@ -722,7 +722,7 @@ int clear_page_dirty_for_io(struct page

if (mapping) {
if (TestClearPageDirty(page)) {
- if (!mapping->backing_dev_info->memory_backed)
+ if (mapping_cap_account_dirty(mapping))
dec_page_state(nr_dirty);
return 1;
}
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/readahead.c linux-2.6.11-rc4-memback/mm/readahead.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/readahead.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/readahead.c 2005-03-08 11:05:04.000000000 +0000
@@ -23,6 +23,7 @@ EXPORT_SYMBOL(default_unplug_io_fn);
struct backing_dev_info default_backing_dev_info = {
.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
.state = 0,
+ .capabilities = BDI_CAP_MAP_COPY,
.unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/shmem.c linux-2.6.11-rc4-memback/mm/shmem.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/shmem.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/shmem.c 2005-03-08 11:34:52.000000000 +0000
@@ -184,8 +184,8 @@ static struct vm_operations_struct shmem

static struct backing_dev_info shmem_backing_dev_info = {
.ra_pages = 0, /* No readahead */
- .memory_backed = 1, /* Does not contribute to dirty memory */
- .unplug_io_fn = default_unplug_io_fn,
+ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+ .unplug_io_fn = default_unplug_io_fn,
};

static LIST_HEAD(shmem_swaplist);
diff -uNrp /warthog/kernels/linux-2.6.11-rc4/mm/swap_state.c linux-2.6.11-rc4-memback/mm/swap_state.c
--- /warthog/kernels/linux-2.6.11-rc4/mm/swap_state.c 2005-02-14 12:19:04.000000000 +0000
+++ linux-2.6.11-rc4-memback/mm/swap_state.c 2005-03-08 11:07:41.000000000 +0000
@@ -29,7 +29,7 @@ static struct address_space_operations s
};

static struct backing_dev_info swap_backing_dev_info = {
- .memory_backed = 1, /* Does not contribute to dirty memory */
+ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
.unplug_io_fn = swap_unplug_io_fn,
};

2005-03-09 00:44:14

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information [try #3]

David Howells <[email protected]> wrote:
>
> The attached patch replaces backing_dev_info::memory_backed with capabilitied
> bitmap.

Looks sane to me, thanks.

I hope you got all the conversions correct - breakage in the writeback
dirty accounting manifests in subtle ways. I'll double-check it.

2005-03-09 10:28:59

by David Howells

[permalink] [raw]
Subject: Re: [PATCH 1/2] BDI: Provide backing device capability information [try #3]


Andrew Morton <[email protected]> wrote:

> > The attached patch replaces backing_dev_info::memory_backed with
> > capabilitied bitmap.
>
> Looks sane to me, thanks.
>
> I hope you got all the conversions correct - breakage in the writeback
> dirty accounting manifests in subtle ways. I'll double-check it.

I think I got the logic as-was. This is quite easy to check just by looking at
the patch. However, the as-was logic should possibly be changed to reflect the
fact that the one control that there was has now been split into two.

David