2008-10-29 01:53:20

by Phillip Lougher

Subject: [PATCH V2 10/16] Squashfs: cache operations


Signed-off-by: Phillip Lougher <[email protected]>
---
fs/squashfs/cache.c | 315 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 315 insertions(+), 0 deletions(-)

diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
new file mode 100644
index 0000000..d121f31
--- /dev/null
+++ b/fs/squashfs/cache.c
@@ -0,0 +1,315 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * cache.c
+ */
+
+/*
+ * Blocks in Squashfs are compressed. To avoid repeatedly decompressing
+ * recently accessed data Squashfs uses two small metadata and fragment caches.
+ *
+ * This file implements a generic cache implementation used for both caches,
+ * plus functions layered on top of the generic cache implementation to
+ * access the metadata and fragment caches.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Look-up block in cache, and increment usage count. If not in cache, read
+ * and decompress it from disk.
+ */
+struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
+	struct squashfs_cache *cache, long long block, int length)
+{
+	int i, n;
+	struct squashfs_cache_entry *entry;
+
+	spin_lock(&cache->lock);
+
+	while (1) {
+		for (i = 0; i < cache->entries; i++)
+			if (cache->entry[i].block == block)
+				break;
+
+		if (i == cache->entries) {
+			/*
+			 * Block not in cache, if all cache entries are locked
+			 * go to sleep waiting for one to become available.
+			 */
+			if (cache->unused == 0) {
+				cache->waiting++;
+				spin_unlock(&cache->lock);
+				wait_event(cache->wait_queue, cache->unused);
+				spin_lock(&cache->lock);
+				cache->waiting--;
+				continue;
+			}
+
+			/*
+			 * At least one unlocked cache entry. A simple
+			 * round-robin strategy is used to choose the entry to
+			 * be evicted from the cache.
+			 */
+			i = cache->next_blk;
+			for (n = 0; n < cache->entries; n++) {
+				if (cache->entry[i].locked == 0)
+					break;
+				i = (i + 1) % cache->entries;
+			}
+
+			cache->next_blk = (i + 1) % cache->entries;
+			entry = &cache->entry[i];
+
+			/*
+			 * Initialise the chosen cache entry, and fill it in
+			 * from disk.
+			 */
+			cache->unused--;
+			entry->block = block;
+			entry->locked = 1;
+			entry->pending = 1;
+			entry->waiting = 0;
+			entry->error = 0;
+			spin_unlock(&cache->lock);
+
+			entry->length = squashfs_read_data(sb, entry->data,
+				block, length, &entry->next_index,
+				cache->block_size);
+
+			spin_lock(&cache->lock);
+
+			if (entry->length < 0)
+				entry->error = entry->length;
+
+			entry->pending = 0;
+			spin_unlock(&cache->lock);
+
+			/*
+			 * While filling this entry one or more other processes
+			 * may have looked it up in the cache and slept waiting
+			 * for it to become available.
+			 */
+			if (entry->waiting)
+				wake_up_all(&entry->wait_queue);
+			goto out;
+		}
+
+		/*
+		 * Block already in cache. Increment lock so it doesn't
+		 * get reused until we're finished with it. If it was
+		 * previously unlocked there is now one less cache entry
+		 * available for reuse.
+		 */
+		entry = &cache->entry[i];
+		if (entry->locked == 0)
+			cache->unused--;
+		entry->locked++;
+
+		/*
+		 * If the entry is currently being filled in by another process
+		 * go to sleep waiting for it to become available.
+		 */
+		if (entry->pending) {
+			entry->waiting++;
+			spin_unlock(&cache->lock);
+			wait_event(entry->wait_queue, !entry->pending);
+			goto out;
+		}
+
+		spin_unlock(&cache->lock);
+		goto out;
+	}
+
+out:
+	TRACE("Got %s %d, start block %lld, locked %d, error %d\n", cache->name,
+		i, entry->block, entry->locked, entry->error);
+
+	if (entry->error)
+		ERROR("Unable to read %s cache entry [%llx]\n", cache->name,
+			block);
+	return entry;
+}
+
+
+/*
+ * Release block, once usage count is zero it can be reused.
+ */
+void squashfs_cache_put(struct squashfs_cache *cache,
+	struct squashfs_cache_entry *entry)
+{
+	spin_lock(&cache->lock);
+	entry->locked--;
+	if (entry->locked == 0) {
+		cache->unused++;
+		spin_unlock(&cache->lock);
+		/*
+		 * If there are any processes waiting for a block to become
+		 * available, wake one up.
+		 */
+		if (cache->waiting)
+			wake_up(&cache->wait_queue);
+	} else {
+		spin_unlock(&cache->lock);
+	}
+}
+
+
+void squashfs_cache_delete(struct squashfs_cache *cache)
+{
+	int i;
+
+	if (cache == NULL)
+		return;
+
+	for (i = 0; i < cache->entries; i++)
+		if (cache->entry[i].data) {
+			if (cache->use_vmalloc)
+				vfree(cache->entry[i].data);
+			else
+				kfree(cache->entry[i].data);
+		}
+
+	kfree(cache);
+}
+
+
+struct squashfs_cache *squashfs_cache_init(char *name, int entries,
+	int block_size, int use_vmalloc)
+{
+	int i;
+	struct squashfs_cache *cache = kzalloc(sizeof(*cache) + entries *
+		sizeof(*(cache->entry)), GFP_KERNEL);
+
+	if (cache == NULL) {
+		ERROR("Failed to allocate %s cache\n", name);
+		return NULL;
+	}
+
+	cache->next_blk = 0;
+	cache->unused = entries;
+	cache->entries = entries;
+	cache->block_size = block_size;
+	cache->use_vmalloc = use_vmalloc;
+	cache->name = name;
+	cache->waiting = 0;
+	spin_lock_init(&cache->lock);
+	init_waitqueue_head(&cache->wait_queue);
+
+	for (i = 0; i < entries; i++) {
+		init_waitqueue_head(&cache->entry[i].wait_queue);
+		cache->entry[i].block = SQUASHFS_INVALID_BLK;
+		cache->entry[i].data = use_vmalloc ? vmalloc(block_size) :
+			kmalloc(block_size, GFP_KERNEL);
+		if (cache->entry[i].data == NULL) {
+			ERROR("Failed to allocate %s cache entry\n", name);
+			goto cleanup;
+		}
+	}
+
+	return cache;
+
+cleanup:
+	squashfs_cache_delete(cache);
+	return NULL;
+}
+
+
+/*
+ * Read length bytes from metadata position <block, offset> (block is the
+ * start of the compressed block on disk, and offset is the offset into
+ * the block once decompressed). Data is packed into consecutive blocks,
+ * and length bytes may require reading more than one block.
+ */
+int squashfs_read_metadata(struct super_block *sb, void *buffer,
+	long long *block, int *offset, int length)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int bytes, return_length = length;
+	struct squashfs_cache_entry *entry;
+
+	TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
+
+	while (1) {
+		entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
+		bytes = entry->length - *offset;
+
+		if (entry->error) {
+			return_length = entry->error;
+			goto finish;
+		} else if (bytes < 1) {
+			return_length = -EIO;
+			goto finish;
+		} else if (bytes >= length) {
+			if (buffer)
+				memcpy(buffer, entry->data + *offset, length);
+			if (entry->length - *offset == length) {
+				*block = entry->next_index;
+				*offset = 0;
+			} else {
+				*offset += length;
+			}
+			goto finish;
+		} else {
+			if (buffer) {
+				memcpy(buffer, entry->data + *offset, bytes);
+				buffer += bytes;
+			}
+			*block = entry->next_index;
+			squashfs_cache_put(msblk->block_cache, entry);
+			length -= bytes;
+			*offset = 0;
+		}
+	}
+
+finish:
+	squashfs_cache_put(msblk->block_cache, entry);
+	return return_length;
+}
+
+
+struct squashfs_cache_entry *get_cached_fragment(struct super_block *sb,
+	long long start_block, int length)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+
+	return squashfs_cache_get(sb, msblk->fragment_cache, start_block,
+		length);
+}
+
+
+void release_cached_fragment(struct squashfs_sb_info *msblk,
+	struct squashfs_cache_entry *fragment)
+{
+	squashfs_cache_put(msblk->fragment_cache, fragment);
+}
+
--
1.5.2.5


2008-10-29 09:33:12

by Jörn Engel

Subject: Re: [PATCH V2 10/16] Squashfs: cache operations

On Wed, 29 October 2008 01:49:56 +0000, Phillip Lougher wrote:
> +/*
> + * Blocks in Squashfs are compressed. To avoid repeatedly decompressing
> + * recently accessed data Squashfs uses two small metadata and fragment caches.
> + *
> + * This file implements a generic cache implementation used for both caches,
> + * plus functions layered on top of the generic cache implementation to
> + * access the metadata and fragment caches.
> + */

I tend to agree with Andrew that a lot of this should be done by the
page cache instead. One of the problems seems to be that your blocksize
can exceed page size and there really isn't any infrastructure to deal
with such cases yet. Bufferheads deal with blocks smaller than a page,
not the other way around.

Another is that address spaces are limited to 16TB on 32-bit
architectures (a 32-bit page index times 4KiB pages). I guess that
should be good enough for a while. I've
heard some rumors that btrfs actually uses multiple address spaces to
handle this problem, so a good strategy may be to sit back, wait and
simply copy what btrfs does once the issue becomes pressing.

To deal with large blocks, you most likely want to keep your struct
squashfs_cache_entry around and have page->private point to it. But be
warned, as the whole page->private business is - shall we say - fragile.
You need to supply a number of methods to make things work. And if you
fail to set any one of them, core code will assume a default, which is
to call into the bufferhead code. And the backtrace you will receive
some time later has little or no indication that you actually only
missed one method. I've been meaning to clean this up, but never found
the courage to actually do it.
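
Roughly, and purely as a sketch (the function names and the back-pointer
from a cache entry to its cache are made up here, squashfs has neither
today), the wiring would look something like:

	static int squashfs_meta_releasepage(struct page *page, gfp_t gfp)
	{
		struct squashfs_cache_entry *entry = (void *)page_private(page);

		if (entry) {
			/* drop the cache reference held via page->private */
			squashfs_cache_put(entry->cache, entry);
			ClearPagePrivate(page);
			set_page_private(page, 0);
		}
		return 1;
	}

	static const struct address_space_operations squashfs_meta_aops = {
		.releasepage	= squashfs_meta_releasepage,
		/*
		 * .invalidatepage and friends also need wiring up, otherwise
		 * core code falls back to the bufferhead defaults.
		 */
	};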

> +/*
> + * Look-up block in cache, and increment usage count. If not in cache, read
> + * and decompress it from disk.
> + */
> +struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
> + struct squashfs_cache *cache, long long block, int length)

I personally prefer u64 instead of long long. It is a device address
for a 64bit filesystem after all. Same for next_index.

> + if (i == cache->entries) {
> + /*
> + * Block not in cache, if all cache entries are locked
> + * go to sleep waiting for one to become available.
> + */
> + if (cache->unused == 0) {
> + cache->waiting++;
> + spin_unlock(&cache->lock);
> + wait_event(cache->wait_queue, cache->unused);
> + spin_lock(&cache->lock);
> + cache->waiting--;

Maybe rename to no_waiters? "waiting" looks more like a boolean.

> + entry->length = squashfs_read_data(sb, entry->data,
> + block, length, &entry->next_index,
> + cache->block_size);
> +
> + spin_lock(&cache->lock);
> +
> + if (entry->length < 0)
> + entry->error = entry->length;

entry->error is of type char. We actually have errno's defined up to
131, so if by whatever freak chance the error is -ENOTRECOVERABLE, this
will convert it to a positive number. I wouldn't want to debug that.
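
A trivial userspace demo shows the effect (assuming ENOTRECOVERABLE is
131, as in the current errno headers):

	#include <stdio.h>

	int main(void)
	{
		int length = -131;	/* say, -ENOTRECOVERABLE from a read */
		char error = length;	/* mirrors entry->error = entry->length */

		printf("%d\n", error);	/* prints 125 on common ABIs */
		return 0;
	}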

> +void squashfs_cache_put(struct squashfs_cache *cache,
> + struct squashfs_cache_entry *entry)
> +{
> + spin_lock(&cache->lock);
> + entry->locked--;
> + if (entry->locked == 0) {

You might want to rename this to "refcount", just to make the name match
the behaviour.

Jörn

--
Measure. Don't tune for speed until you've measured, and even then
don't unless one part of the code overwhelms the rest.
-- Rob Pike

2008-10-31 04:43:56

by Phillip Lougher

Subject: Re: [PATCH V2 10/16] Squashfs: cache operations

Jörn Engel wrote:
> On Wed, 29 October 2008 01:49:56 +0000, Phillip Lougher wrote:
>> +/*
>> + * Blocks in Squashfs are compressed. To avoid repeatedly decompressing
>> + * recently accessed data Squashfs uses two small metadata and fragment caches.
>> + *
>> + * This file implements a generic cache implementation used for both caches,
>> + * plus functions layered on top of the generic cache implementation to
>> + * access the metadata and fragment caches.
>> + */
>
> I tend to agree with Andrew that a lot of this should be done by the
> page cache instead. One of the problems seems to be that your blocksize
> can exceed page size and there really isn't any infrastructure to deal
> with such cases yet. Bufferheads deal with blocks smaller than a page,
> not the other way around.
>

I thought about using the page cache, but the fact that block sizes
exceed the page cache size is only one of a number of reasons why I
prefer the current solution; there is also simplicity and speed to consider.

There are three types of compressed block in Squashfs: datablocks,
fragments, and metadata blocks. Of these, datablocks (by far the largest
number of blocks) are decompressed and pushed into the page cache, and
are not otherwise cached by Squashfs. This, obviously (?), is because
they contain data for only one file, and so at time of access there is a
readily available address space to push the data into.

Metadata and fragment blocks are different in that when accessed and
decompressed (say for an inode or for a particular tail-end block) they
will contain metadata or tail-ends for other files. This data could be
thrown away, but due to locality of reference it makes sense to
temporarily cache it in case a near-future file access references the
data. But it doesn't make much sense to cache it more than temporarily:
much of the data will probably not be reused, and it exists compressed
in the buffer cache.

The squashfs cache is therefore designed to cache only the last couple
of metadata and fragment blocks. It is also designed to be simple and
extremely fast. The maximum size of the metadata cache is only 64 KiB.

Simplicity and speed are extremely important. The
squashfs_metadata_read() wrapper around the cache is designed to step
through the metadata a structure at a time (20 or so bytes), updating
the read position in the metadata each call, with more metadata cache
blocks being read and decompressed as necessary. The common case where
the metadata is already in the cache (because we're stepping through it
20 or so bytes at a time), is designed to be extremely fast - a spinlock
and array search only. I recently optimised the cache to use spinlocks
rather than mutexes and reduced the lock holding time (necessary to move
to spinlocks), and this resulted in a 20%+ speed improvement in reading
squashfs filesystems.
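
For illustration, a typical caller steps through a run of records like
this (the 20-byte record layout is made up for the example, but the
squashfs_read_metadata() interface is the one in the patch):

	struct example_record {		/* hypothetical 20-byte structure */
		__le64	start_block;
		__le32	size;
		__le32	offset;
		__le32	type;
	} __attribute__ ((packed));

	static int read_records(struct super_block *sb, long long block,
		int offset, int count)
	{
		struct example_record rec;
		int i, err;

		for (i = 0; i < count; i++) {
			/*
			 * block/offset are advanced by the call itself, so
			 * consecutive calls usually hit the same cached
			 * metadata block: a spinlock and array search only.
			 */
			err = squashfs_read_metadata(sb, &rec, &block,
				&offset, sizeof(rec));
			if (err < 0)
				return err;
			/* ... use rec ... */
		}
		return 0;
	}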

Given the above, using an address space in the page cache will result in
greater complexity, more memory overhead, and much slower operation.
There's a number of reasons for this.

1. The amount and life-span of the data stored in the page cache is
outside of Squashfs' control. As explained above it only makes sense to
temporarily cache the last couple of metadata and fragment blocks.
Using the page cache (if a 'large' address space is used) for these
keeps more of them around for longer than necessary, and will
potentially cause more worthy datablocks to be flushed on memory pressure.

2. The address space will be caching uncompressed data, the squashfs
references to this data are the compressed locations within the
filesystem. There doesn't exist a one-to-one linear mapping from
compressed location to decompressed location in the address space. This
means a lookup table still needs to be used to store the mapping from
compressed location to decompressed location in the address space. Now
this lookup table (however implemented) is itself at least as complex as
my current cache implementation.

3. Once the locations of the decompressed pages in the address space
have been found, they'll need to be looked up in the page cache, and
this has to be done for every 4K page. With the default fragment size
of 128 KiB this means 32 separate lookups. Somewhat slower than one
spinlock and array search per 128 KiB block in the squashfs cache
implementation.

Comments, especially those of the form "you've got this completely
wrong, and you can use the page cache like this, which will be simpler
and faster than your current implementation" welcome :) I'm not adverse
to using the page cache, but I can't see how it will be simpler or
faster than the current implementation.

Phillip

2008-10-31 07:29:47

by Jörn Engel

Subject: Re: [PATCH V2 10/16] Squashfs: cache operations

On Fri, 31 October 2008 04:43:46 +0000, Phillip Lougher wrote:
>
> Simplicity and speed are extremely important. The
> squashfs_metadata_read() wrapper around the cache is designed to step
> through the metadata a structure at a time (20 or so bytes), updating
> the read position in the metadata each call, with more metadata cache
> blocks being read and decompressed as necessary. The common case where
> the metadata is already in the cache (because we're stepping through it
> 20 or so bytes at a time), is designed to be extremely fast - a spinlock
> and array search only. I recently optimised the cache to use spinlocks
> rather than mutexes and reduced the lock holding time (necessary to move
> to spinlocks), and this resulted in a 20%+ speed improvement in reading
> squashfs filesystems.

For the page cache, you can use read_cache_page() and
page_cache_release(). This does not even take a spinlock, hence
multiple readers can work in parallel. The radix tree walk will take a
similar time to your array search.
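
As a rough sketch (the private meta_mapping address space and
squashfs_decompress_meta_block() are hypothetical; only the
read_cache_page()/page_cache_release() calls are existing API):

	static int squashfs_meta_filler(void *data, struct page *page)
	{
		struct super_block *sb = data;
		void *dst = kmap(page);
		int err;

		/* decompress the metadata block behind page->index */
		err = squashfs_decompress_meta_block(sb, page->index, dst,
			PAGE_CACHE_SIZE);
		kunmap(page);

		if (err < 0)
			SetPageError(page);
		else
			SetPageUptodate(page);
		unlock_page(page);
		return err < 0 ? err : 0;
	}

and the lookup side becomes:

	struct page *page = read_cache_page(meta_mapping, index,
		squashfs_meta_filler, sb);

	if (IS_ERR(page))
		return PTR_ERR(page);
	/* copy out whatever is needed, then: */
	page_cache_release(page);

Whether that ends up simpler than the array is of course the question at
hand.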

You are aware that multiple cpus will play pingpong with your spinlock?

> Given the above, using an address space in the page cache will result in
> greater complexity, more memory overhead, and much slower operation.
> There's a number of reasons for this.
>
> 1. The amount and life-span of the data stored in the page cache is
> outside of Squashfs' control. As explained above it only makes sense to
> temporarily cache the last couple of metadata and fragment blocks.
> Using the page cache (if a 'large' address space is used) for these
> keeps more of them around for longer than necessary, and will
> potentially cause more worthy datablocks to be flushed on memory pressure.

I personally find it hard to guess access patterns. If I construct a
workload just large enough that your cache is slightly too small, it
will start thrashing. Hit rates will go close to zero. And unless I
missed something, such a workload can be constructed and may even be
used in real life. With growing numbers of cores, this becomes
increasingly likely.

In other cases, an idle squashfs will still hold the 64k of unused cache
for ransom. By giving up control, you allow your cache to grow and
shrink as desired and can avoid both cases.

> 2. The address space will be caching uncompressed data, the squashfs
> references to this data are the compressed locations within the
> filesystem. There doesn't exist a one-to-one linear mapping from
> compressed location to decompressed location in the address space. This
> means a lookup table still needs to be used to store the mapping from
> compressed location to decompressed location in the address space. Now
> this lookup table (however implemented) is itself at least as complex as
> my current cache implementation.

You are currently using the physical offset on the medium to address
blocks in the cache, which is 64-bit. The page cache may use 32 bits to
address pages. And since blocks can be larger than pages, you
effectively need some extra bits. This is indeed a problem.

> 3. Once the locations of the decompressed pages in the address space
> have been found, they'll need to be looked up in the page cache, and
> this has to be done for every 4K page. With the default fragment size
> of 128 KiB this means 32 separate lookups. Somewhat slower than one
> spinlock and array search per 128 KiB block in the squashfs cache
> implementation.

Above you claimed the complete cache to be just 64k in size. How can
you access 128k blocks that way? One of the numbers appears wrong.

Ignoring that, you don't need to take either the spinlock or do a lookup
in a fast path. If you currently have this code:

	for (some condition) {
		err = squashfs_read_metadata(address, ...);
	}

You can transform it to:

	void *handle = squashfs_get_metadata(address, ...);
	for (some condition) {
		err = squashfs_read_metadata(handle, address, ...);
	}
	squashfs_put_metadata(handle);

With the handle you can keep a reference count to your cached object.
squashfs_read_metadata() only has to call squashfs_cache_get() or
squashfs_cache_put() when moving across object boundaries. In the
common case it simply returns data from the object referenced through
the handle.

This might be a worthwhile optimization independently of whether you use
the page cache or not.
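
Purely as a sketch of the idea (the handle struct and the get/put helpers
are invented here; only squashfs_cache_get()/squashfs_cache_put() and
msblk->block_cache come from the patch):

	struct squashfs_meta_handle {
		struct squashfs_cache_entry *entry;	/* current block */
		long long block;			/* its start on disk */
	};

	static struct squashfs_meta_handle *squashfs_get_metadata(
		struct super_block *sb, long long block)
	{
		struct squashfs_sb_info *msblk = sb->s_fs_info;
		struct squashfs_meta_handle *h = kmalloc(sizeof(*h), GFP_KERNEL);

		if (h == NULL)
			return NULL;
		h->entry = squashfs_cache_get(sb, msblk->block_cache, block, 0);
		h->block = block;
		return h;
	}

	static void squashfs_put_metadata(struct super_block *sb,
		struct squashfs_meta_handle *h)
	{
		struct squashfs_sb_info *msblk = sb->s_fs_info;

		squashfs_cache_put(msblk->block_cache, h->entry);
		kfree(h);
	}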

> Comments, especially those of the form "you've got this completely
> wrong, and you can use the page cache like this, which will be simpler
> and faster than your current implementation" welcome :) I'm not adverse
> to using the page cache, but I can't see how it will be simpler or
> faster than the current implementation.

Only one of your problems seems to be real. Not sure if or how we can
solve that one, though.

Jörn

--
"Security vulnerabilities are here to stay."
-- Scott Culp, Manager of the Microsoft Security Response Center, 2001

2008-10-31 15:03:39

by Phillip Lougher

Subject: Re: [PATCH V2 10/16] Squashfs: cache operations

Jörn Engel wrote:

>
> Only one of your problems seems to be real. Not sure if or how we can
> solve that one, though.
>

Sorry, I don't agree. But I'm not going to argue this like a couple of old
maids.

I'll keep what I currently do unless Andrew Morton or someone else says
it's not getting mainlined unless it's changed.

Phillip

2008-11-03 14:11:22

by Evgeniy Polyakov

Subject: Re: [PATCH V2 10/16] Squashfs: cache operations

Hi.

A couple of comments below.

On Wed, Oct 29, 2008 at 01:49:56AM +0000, Phillip Lougher ([email protected]) wrote:
> +struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
> + struct squashfs_cache *cache, long long block, int length)
> +{
> + int i, n;
> + struct squashfs_cache_entry *entry;
> +
> + spin_lock(&cache->lock);
> +
> + while (1) {
> + for (i = 0; i < cache->entries; i++)
> + if (cache->entry[i].block == block)
> + break;
> +
> + if (i == cache->entries) {
> + /*
> + * Block not in cache, if all cache entries are locked
> + * go to sleep waiting for one to become available.
> + */
> + if (cache->unused == 0) {
> + cache->waiting++;
> + spin_unlock(&cache->lock);
> + wait_event(cache->wait_queue, cache->unused);
> + spin_lock(&cache->lock);
> + cache->waiting--;
> + continue;
> + }
> +
> + /*
> + * At least one unlocked cache entry. A simple
> + * round-robin strategy is used to choose the entry to
> + * be evicted from the cache.
> + */
> + i = cache->next_blk;
> + for (n = 0; n < cache->entries; n++) {
> + if (cache->entry[i].locked == 0)
> + break;
> + i = (i + 1) % cache->entries;
> + }
> +
> + cache->next_blk = (i + 1) % cache->entries;
> + entry = &cache->entry[i];

This is invoked for every read once the cache is filled, if I understood
correctly, and having a modulo in this path is additional overhead.
It may be hidden behind the compression overhead, but still.
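
For what it's worth, the wrap-around can be written without a division,
something like (untested):

	i = cache->next_blk;
	for (n = 0; n < cache->entries; n++) {
		if (cache->entry[i].locked == 0)
			break;
		if (++i == cache->entries)
			i = 0;
	}

	cache->next_blk = (i + 1 == cache->entries) ? 0 : i + 1;
	entry = &cache->entry[i];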

Also, what happens when there are no unlocked entries? I.e. will you try
to work with an existing one while it is already locked and being
processed by another thread?


--
Evgeniy Polyakov