2008-03-05 22:58:12

by Joel Becker

[permalink] [raw]
Subject: [PATCH 0/7] ocfs2: Extended slot map

ocfs2 has a system file called "slot_map". A "slot" is a collection of
files local to a particular mounted node, including the journal and
allocators that node is using. The slot map converts the slot number to
a node number, so when a node dies, ocfs2 knows which slot to recover.

The old ocfs2 slot map is very limited. It has a physical maximum of
254 entries - specifically, it must fit within one disk block. It only
allows node numbers up to 254, and cannot be extended past INT16_MAX
(32767). This is a problem in the world of userspace cluster stacks,
where the node numbers are often sparse and can be up to UINT32_MAX.

It also has the structural problem that empty slots are signified by a
magic number. That number happens to be -1 (0xFFFF). It makes for code
that isn't as obvious as one would like.

Thus, we introduce a new slot map format, hereafter referred to as the
"extended slot map". The extended slot map is allocated as regular file
space, and so is bound by i_size. The new format adds a "valid" field,
distinct from the node number. Finally, it has room for extension
should it be needed.

The kernel code is available on the 'new-slot-map' branch of my git
repository.

View:
http://oss.oracle.com/git/?p=jlbec/linux-2.6.git;a=shortlog;h=new-slot-map
Pull:
git pull git://oss.oracle.com/git/jlbec/linux-2.6.git new-slot-map

The tools code is also available via git, in the 'new-slot-map' branch
as well.

View:
http://oss.oracle.com/git/?p=ocfs2-tools.git;a=shortlog;h=new-slot-map
Pull:
git pull git://oss.oracle.com/git/ocfs2-tools.git new-slot-map



2008-03-05 22:56:08

by Joel Becker

[permalink] [raw]
Subject: [PATCH 2/7] ocfs2: Make ocfs2_slot_info private.

Just use osb_lock around the ocfs2_slot_info data. This allows us to
make the ocfs2_slot_info structure private to slot_map.c. All access
is now via accessors.

Signed-off-by: Joel Becker <[email protected]>
---
fs/ocfs2/journal.c | 24 +++++++-------
fs/ocfs2/ocfs2.h | 1 +
fs/ocfs2/slot_map.c | 81 ++++++++++++++++++++++++++++++++++++---------------
fs/ocfs2/slot_map.h | 25 ++-------------
4 files changed, 74 insertions(+), 57 deletions(-)

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c2e654e..ed0c6d0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1079,7 +1079,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
{
int status = 0;
int slot_num;
- struct ocfs2_slot_info *si = osb->slot_info;
struct ocfs2_dinode *la_copy = NULL;
struct ocfs2_dinode *tl_copy = NULL;

@@ -1092,8 +1091,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
* case we should've called ocfs2_journal_load instead. */
BUG_ON(osb->node_num == node_num);

- slot_num = ocfs2_node_num_to_slot(si, node_num);
- if (slot_num == OCFS2_INVALID_SLOT) {
+ slot_num = ocfs2_node_num_to_slot(osb, node_num);
+ if (slot_num == -ENOENT) {
status = 0;
mlog(0, "no slot for this node, so no recovery required.\n");
goto done;
@@ -1183,23 +1182,24 @@ bail:
* slot info struct has been updated from disk. */
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
{
- int status, i, node_num;
- struct ocfs2_slot_info *si = osb->slot_info;
+ unsigned int node_num;
+ int status, i;

/* This is called with the super block cluster lock, so we
* know that the slot map can't change underneath us. */

- spin_lock(&si->si_lock);
- for(i = 0; i < si->si_num_slots; i++) {
+ spin_lock(&osb->osb_lock);
+ for (i = 0; i < osb->max_slots; i++) {
if (i == osb->slot_num)
continue;
- if (ocfs2_is_empty_slot(si, i))
+
+ status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
+ if (status == -ENOENT)
continue;

- node_num = si->si_global_node_nums[i];
if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
continue;
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);

/* Ok, we have a slot occupied by another node which
* is not in the recovery map. We trylock his journal
@@ -1215,9 +1215,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
goto bail;
}

- spin_lock(&si->si_lock);
+ spin_lock(&osb->osb_lock);
}
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);

status = 0;
bail:
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6546cef..ee3f675 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -179,6 +179,7 @@ enum ocfs2_mount_options
#define OCFS2_DEFAULT_ATIME_QUANTUM 60

struct ocfs2_journal;
+struct ocfs2_slot_info;
struct ocfs2_super
{
struct task_struct *commit_task;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index f5727b8..762360d 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -42,13 +42,25 @@

#include "buffer_head_io.h"

+struct ocfs2_slot_info {
+ struct inode *si_inode;
+ struct buffer_head *si_bh;
+ unsigned int si_num_slots;
+ unsigned int si_size;
+ s16 si_global_node_nums[OCFS2_MAX_SLOTS];
+};
+
+
static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
s16 global);
static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
s16 slot_num,
s16 node_num);

-/* post the slot information on disk into our slot_info struct. */
+/*
+ * Post the slot information on disk into our slot_info struct.
+ * Must be protected by osb_lock.
+ */
static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
{
int i;
@@ -56,13 +68,10 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)

/* we don't read the slot block here as ocfs2_super_lock
* should've made sure we have the most recent copy. */
- spin_lock(&si->si_lock);
disk_info = (__le16 *) si->si_bh->b_data;

for (i = 0; i < si->si_size; i++)
si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
-
- spin_unlock(&si->si_lock);
}

int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
@@ -76,8 +85,11 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)

bh = si->si_bh;
ret = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, si->si_inode);
- if (ret == 0)
+ if (ret == 0) {
+ spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
+ spin_unlock(&osb->osb_lock);
+ }

return ret;
}
@@ -90,10 +102,10 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
int status, i;
__le16 *disk_info = (__le16 *) si->si_bh->b_data;

- spin_lock(&si->si_lock);
+ spin_lock(&osb->osb_lock);
for (i = 0; i < si->si_size; i++)
disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);

status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
if (status < 0)
@@ -119,7 +131,8 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
return ret;
}

-static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
+static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
+ s16 preferred)
{
int i;
s16 ret = OCFS2_INVALID_SLOT;
@@ -141,15 +154,36 @@ out:
return ret;
}

-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global)
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
{
- s16 ret;
+ s16 slot;
+ struct ocfs2_slot_info *si = osb->slot_info;

- spin_lock(&si->si_lock);
- ret = __ocfs2_node_num_to_slot(si, global);
- spin_unlock(&si->si_lock);
- return ret;
+ spin_lock(&osb->osb_lock);
+ slot = __ocfs2_node_num_to_slot(si, node_num);
+ spin_unlock(&osb->osb_lock);
+
+ if (slot == OCFS2_INVALID_SLOT)
+ return -ENOENT;
+
+ return slot;
+}
+
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+ unsigned int *node_num)
+{
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ assert_spin_locked(&osb->osb_lock);
+
+ BUG_ON(slot_num < 0);
+ BUG_ON(slot_num > osb->max_slots);
+
+ if (si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT)
+ return -ENOENT;
+
+ *node_num = si->si_global_node_nums[slot_num];
+ return 0;
}

static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
@@ -184,9 +218,9 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num)
if (si == NULL)
return 0;

- spin_lock(&si->si_lock);
+ spin_lock(&osb->osb_lock);
__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);

return ocfs2_update_disk_slots(osb, osb->slot_info);
}
@@ -206,7 +240,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
goto bail;
}

- spin_lock_init(&si->si_lock);
si->si_num_slots = osb->max_slots;
si->si_size = OCFS2_MAX_SLOTS;

@@ -235,7 +268,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)

si->si_inode = inode;
si->si_bh = bh;
- osb->slot_info = si;
+ osb->slot_info = (struct ocfs2_slot_info *)si;
bail:
if (status < 0 && si)
__ocfs2_free_slot_info(si);
@@ -261,9 +294,9 @@ int ocfs2_find_slot(struct ocfs2_super *osb)

si = osb->slot_info;

+ spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);

- spin_lock(&si->si_lock);
/* search for ourselves first and take the slot if it already
* exists. Perhaps we need to mark this in a variable for our
* own journal recovery? Possibly not, though we certainly
@@ -274,7 +307,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
* one. */
slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
if (slot == OCFS2_INVALID_SLOT) {
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);
mlog(ML_ERROR, "no free slots available!\n");
status = -EINVAL;
goto bail;
@@ -285,7 +318,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)

__ocfs2_fill_slot(si, slot, osb->node_num);
osb->slot_num = slot;
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);

mlog(0, "taking node slot %d\n", osb->slot_num);

@@ -306,12 +339,12 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
if (!si)
return;

+ spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);

- spin_lock(&si->si_lock);
__ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
osb->slot_num = OCFS2_INVALID_SLOT;
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);

status = ocfs2_update_disk_slots(osb, si);
if (status < 0) {
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index b029ffd..5118e89 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -27,16 +27,6 @@
#ifndef SLOTMAP_H
#define SLOTMAP_H

-struct ocfs2_slot_info {
- spinlock_t si_lock;
-
- struct inode *si_inode;
- struct buffer_head *si_bh;
- unsigned int si_num_slots;
- unsigned int si_size;
- s16 si_global_node_nums[OCFS2_MAX_SLOTS];
-};
-
int ocfs2_init_slot_info(struct ocfs2_super *osb);
void ocfs2_free_slot_info(struct ocfs2_super *osb);

@@ -45,17 +35,10 @@ void ocfs2_put_slot(struct ocfs2_super *osb);

int ocfs2_refresh_slot_info(struct ocfs2_super *osb);

-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global);
-int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num);
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+ unsigned int *node_num);

-static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
- int slot_num)
-{
- BUG_ON(slot_num == OCFS2_INVALID_SLOT);
- assert_spin_locked(&si->si_lock);
-
- return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
-}
+int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num);

#endif
--
1.5.3.8

2008-03-05 22:56:40

by Joel Becker

[permalink] [raw]
Subject: [PATCH 5/7] ocfs2: De-magic the in-memory slot map.

The in-memory slot map uses the same magic as the on-disk one. There is
a special value to mark a slot as invalid. It relies on the size of
certain types and so on.

Write a new in-memory map that keeps validity as a separate field. Outside
of the I/O functions, OCFS2_INVALID_SLOT now means what it is supposed to.
It also is no longer tied to the type size.

This also means that only the I/O functions refer to 16bit quantities.

Signed-off-by: Joel Becker <[email protected]>
---
fs/ocfs2/journal.c | 2 +-
fs/ocfs2/ocfs2.h | 6 +-
fs/ocfs2/slot_map.c | 130 ++++++++++++++++++++++++++++-----------------------
fs/ocfs2/slot_map.h | 2 +-
4 files changed, 77 insertions(+), 63 deletions(-)

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index fe679f3..4cd982a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -71,7 +71,7 @@ static int ocfs2_commit_thread(void *arg);
*/

struct ocfs2_recovery_map {
- int rm_used;
+ unsigned int rm_used;
unsigned int *rm_entries;
};

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c6ed8c3..95f783d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -216,10 +216,10 @@ struct ocfs2_super
unsigned long s_mount_opt;
unsigned int s_atime_quantum;

- u16 max_slots;
+ unsigned int max_slots;
s16 node_num;
- s16 slot_num;
- s16 preferred_slot;
+ int slot_num;
+ int preferred_slot;
int s_sectsize_bits;
int s_clustersize;
int s_clustersize_bits;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 5bddee1..65a61bf 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -42,21 +42,41 @@

#include "buffer_head_io.h"

+
+struct ocfs2_slot {
+ int sl_valid;
+ unsigned int sl_node_num;
+};
+
struct ocfs2_slot_info {
struct inode *si_inode;
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
- unsigned int si_size;
- s16 si_global_node_nums[OCFS2_MAX_SLOTS];
+ struct ocfs2_slot *si_slots;
};


-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global);
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
- s16 slot_num,
- s16 node_num);
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+ unsigned int node_num);
+
+static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
+ int slot_num)
+{
+ BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+ si->si_slots[slot_num].sl_valid = 0;
+}
+
+static void ocfs2_set_slot(struct ocfs2_slot_info *si,
+ int slot_num, unsigned int node_num)
+{
+ BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+ BUG_ON((node_num == O2NM_INVALID_NODE_NUM) ||
+ (node_num >= O2NM_MAX_NODES));
+
+ si->si_slots[slot_num].sl_valid = 1;
+ si->si_slots[slot_num].sl_node_num = node_num;
+}

/*
* Post the slot information on disk into our slot_info struct.
@@ -71,8 +91,12 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
* should've made sure we have the most recent copy. */
disk_info = (__le16 *) si->si_bh[0]->b_data;

- for (i = 0; i < si->si_size; i++)
- si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+ for (i = 0; i < si->si_num_slots; i++) {
+ if (le16_to_cpu(disk_info[i]) == (u16)OCFS2_INVALID_SLOT)
+ ocfs2_invalidate_slot(si, i);
+ else
+ ocfs2_set_slot(si, i, le16_to_cpu(disk_info[i]));
+ }
}

int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
@@ -114,8 +138,13 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
__le16 *disk_info = (__le16 *) si->si_bh[0]->b_data;

spin_lock(&osb->osb_lock);
- for (i = 0; i < si->si_size; i++)
- disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
+ for (i = 0; i < si->si_num_slots; i++) {
+ if (si->si_slots[i].sl_valid)
+ disk_info[i] =
+ cpu_to_le16(si->si_slots[i].sl_node_num);
+ else
+ disk_info[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
+ }
spin_unlock(&osb->osb_lock);

status = ocfs2_write_block(osb, si->si_bh[0], si->si_inode);
@@ -147,39 +176,39 @@ static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
return 0;
}

-/* try to find global node in the slot info. Returns
- * OCFS2_INVALID_SLOT if nothing is found. */
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global)
+/* try to find global node in the slot info. Returns -ENOENT
+ * if nothing is found. */
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+ unsigned int node_num)
{
- int i;
- s16 ret = OCFS2_INVALID_SLOT;
+ int i, ret = -ENOENT;

for(i = 0; i < si->si_num_slots; i++) {
- if (global == si->si_global_node_nums[i]) {
- ret = (s16) i;
+ if (si->si_slots[i].sl_valid &&
+ (node_num == si->si_slots[i].sl_node_num)) {
+ ret = i;
break;
}
}
+
return ret;
}

-static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
- s16 preferred)
+static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
+ int preferred)
{
- int i;
- s16 ret = OCFS2_INVALID_SLOT;
+ int i, ret = -ENOSPC;

- if (preferred >= 0 && preferred < si->si_num_slots) {
- if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
+ if ((preferred >= 0) && (preferred < si->si_num_slots)) {
+ if (!si->si_slots[preferred].sl_valid) {
ret = preferred;
goto out;
}
}

for(i = 0; i < si->si_num_slots; i++) {
- if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
- ret = (s16) i;
+ if (!si->si_slots[i].sl_valid) {
+ ret = i;
break;
}
}
@@ -189,16 +218,13 @@ out:

int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
{
- s16 slot;
+ int slot;
struct ocfs2_slot_info *si = osb->slot_info;

spin_lock(&osb->osb_lock);
slot = __ocfs2_node_num_to_slot(si, node_num);
spin_unlock(&osb->osb_lock);

- if (slot == OCFS2_INVALID_SLOT)
- return -ENOENT;
-
return slot;
}

@@ -212,10 +238,10 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
BUG_ON(slot_num < 0);
BUG_ON(slot_num > osb->max_slots);

- if (si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT)
+ if (!si->si_slots[slot_num].sl_valid)
return -ENOENT;

- *node_num = si->si_global_node_nums[slot_num];
+ *node_num = si->si_slots[slot_num].sl_node_num;
return 0;
}

@@ -241,19 +267,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
kfree(si);
}

-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
- s16 slot_num,
- s16 node_num)
-{
- BUG_ON(slot_num == OCFS2_INVALID_SLOT);
- BUG_ON(slot_num >= si->si_num_slots);
- BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
- (node_num >= O2NM_MAX_NODES));
-
- si->si_global_node_nums[slot_num] = node_num;
-}
-
-int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num)
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
{
struct ocfs2_slot_info *si = osb->slot_info;

@@ -261,7 +275,7 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num)
return 0;

spin_lock(&osb->osb_lock);
- __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
+ ocfs2_invalidate_slot(si, slot_num);
spin_unlock(&osb->osb_lock);

return ocfs2_update_disk_slots(osb, osb->slot_info);
@@ -324,11 +338,13 @@ bail:

int ocfs2_init_slot_info(struct ocfs2_super *osb)
{
- int status, i;
+ int status;
struct inode *inode = NULL;
struct ocfs2_slot_info *si;

- si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+ si = kzalloc(sizeof(struct ocfs2_slot_info) +
+ (sizeof(struct ocfs2_slot) * osb->max_slots),
+ GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
@@ -336,10 +352,8 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
}

si->si_num_slots = osb->max_slots;
- si->si_size = OCFS2_MAX_SLOTS;
-
- for(i = 0; i < si->si_num_slots; i++)
- si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
+ si->si_slots = (struct ocfs2_slot *)((char *)si +
+ sizeof(struct ocfs2_slot_info));

inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -375,7 +389,7 @@ void ocfs2_free_slot_info(struct ocfs2_super *osb)
int ocfs2_find_slot(struct ocfs2_super *osb)
{
int status;
- s16 slot;
+ int slot;
struct ocfs2_slot_info *si;

mlog_entry_void();
@@ -390,11 +404,11 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
* own journal recovery? Possibly not, though we certainly
* need to warn to the user */
slot = __ocfs2_node_num_to_slot(si, osb->node_num);
- if (slot == OCFS2_INVALID_SLOT) {
+ if (slot < 0) {
/* if no slot yet, then just take 1st available
* one. */
slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
- if (slot == OCFS2_INVALID_SLOT) {
+ if (slot < 0) {
spin_unlock(&osb->osb_lock);
mlog(ML_ERROR, "no free slots available!\n");
status = -EINVAL;
@@ -404,7 +418,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
slot);

- __ocfs2_fill_slot(si, slot, osb->node_num);
+ ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
spin_unlock(&osb->osb_lock);

@@ -430,7 +444,7 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);

- __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
+ ocfs2_invalidate_slot(si, osb->slot_num);
osb->slot_num = OCFS2_INVALID_SLOT;
spin_unlock(&osb->osb_lock);

diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 5118e89..601c95f 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -39,6 +39,6 @@ int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
unsigned int *node_num);

-int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num);
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);

#endif
--
1.5.3.8

2008-03-05 22:57:13

by Joel Becker

[permalink] [raw]
Subject: [PATCH 7/7] ocfs2: New slot map format

The old slot map had a few limitations:

- It was limited to one block, so the maximum slot count was 255.
- Each slot was signed 16bits, limiting node numbers to INT16_MAX.
- An empty slot was marked by the magic 0xFFFF (-1).

The new slot map format provides 32bit node numbers (UINT32_MAX), a
separate space to mark a slot in use, and extra room to grow. The slot
map is now bounded by i_size, not a block.

Signed-off-by: Joel Becker <[email protected]>
---
fs/ocfs2/ocfs2.h | 7 +++
fs/ocfs2/ocfs2_fs.h | 31 +++++++++++++-
fs/ocfs2/slot_map.c | 110 +++++++++++++++++++++++++++++++++++++++++++++------
3 files changed, 133 insertions(+), 15 deletions(-)

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 95f783d..f78e9ed 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -374,6 +374,13 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
}

+static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
+{
+ return (osb->s_feature_incompat &
+ OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
+}
+
+
#define OCFS2_IS_VALID_DINODE(ptr) \
(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3299116..c495023 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,8 @@
#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
| OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
- | OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
+ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
+ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP)
#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN

/*
@@ -125,6 +126,10 @@
/* Support for data packed into inode blocks */
#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040

+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@@ -476,7 +481,8 @@ struct ocfs2_extent_block

/*
* On disk slot map for OCFS2. This defines the contents of the "slot_map"
- * system file.
+ * system file. A slot is valid if it contains a node number >= 0. The
+ * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty.
*/
struct ocfs2_slot_map {
/*00*/ __le16 sm_slots[0];
@@ -486,6 +492,27 @@ struct ocfs2_slot_map {
*/
};

+struct ocfs2_extended_slot {
+/*00*/ __u8 es_valid;
+ __u8 es_reserved1[3];
+ __le32 es_node_num;
+/*10*/
+};
+
+/*
+ * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP
+ * is set. It separates out the valid marker from the node number, and
+ * has room to grow. Unlike the old slot map, this format is defined by
+ * i_size.
+ */
+struct ocfs2_slot_map_extended {
+/*00*/ struct ocfs2_extended_slot se_slots[0];
+/*
+ * Actual size is i_size of the slot_map system file. It should
+ * match s_max_slots * sizeof(struct ocfs2_extended_slot)
+ */
+};
+
/*
* On disk superblock for OCFS2
* Note that it is contained inside an ocfs2_dinode, so all offsets
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e7e7a74..63fb1b2 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -49,6 +49,8 @@ struct ocfs2_slot {
};

struct ocfs2_slot_info {
+ int si_extended;
+ int si_slots_per_block;
struct inode *si_inode;
unsigned int si_blocks;
struct buffer_head **si_bh;
@@ -78,17 +80,37 @@ static void ocfs2_set_slot(struct ocfs2_slot_info *si,
si->si_slots[slot_num].sl_node_num = node_num;
}

+/* This version is for the extended slot map */
+static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
+{
+ int b, i, slotno;
+ struct ocfs2_slot_map_extended *se;
+
+ slotno = 0;
+ for (b = 0; b < si->si_blocks; b++) {
+ se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
+ for (i = 0;
+ (i < si->si_slots_per_block) &&
+ (slotno < si->si_num_slots);
+ i++, slotno++) {
+ if (se->se_slots[i].es_valid)
+ ocfs2_set_slot(si, slotno,
+ le32_to_cpu(se->se_slots[i].es_node_num));
+ else
+ ocfs2_invalidate_slot(si, slotno);
+ }
+ }
+}
+
/*
* Post the slot information on disk into our slot_info struct.
* Must be protected by osb_lock.
*/
-static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
{
int i;
struct ocfs2_slot_map *sm;

- /* we don't read the slot block here as ocfs2_super_lock
- * should've made sure we have the most recent copy. */
sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;

for (i = 0; i < si->si_num_slots; i++) {
@@ -99,6 +121,18 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
}
}

+static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+ /*
+ * The slot data will have been refreshed when ocfs2_super_lock
+ * was taken.
+ */
+ if (si->si_extended)
+ ocfs2_update_slot_info_extended(si);
+ else
+ ocfs2_update_slot_info_old(si);
+}
+
int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
{
int ret;
@@ -131,13 +165,31 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)

/* post the our slot info stuff into it's destination bh and write it
* out. */
-static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
- struct ocfs2_slot_info *si)
+static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
+ int slot_num,
+ struct buffer_head **bh)
+{
+ int blkind = slot_num / si->si_slots_per_block;
+ int slotno = slot_num % si->si_slots_per_block;
+ struct ocfs2_slot_map_extended *se;
+
+ BUG_ON(blkind >= si->si_blocks);
+
+ se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
+ se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
+ if (si->si_slots[slot_num].sl_valid)
+ se->se_slots[slotno].es_node_num =
+ cpu_to_le32(si->si_slots[slot_num].sl_node_num);
+ *bh = si->si_bh[blkind];
+}
+
+static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
+ int slot_num,
+ struct buffer_head **bh)
{
- int status, i;
+ int i;
struct ocfs2_slot_map *sm;

- spin_lock(&osb->osb_lock);
sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
for (i = 0; i < si->si_num_slots; i++) {
if (si->si_slots[i].sl_valid)
@@ -146,9 +198,24 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
else
sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
}
+ *bh = si->si_bh[0];
+}
+
+static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
+ struct ocfs2_slot_info *si,
+ int slot_num)
+{
+ int status;
+ struct buffer_head *bh;
+
+ spin_lock(&osb->osb_lock);
+ if (si->si_extended)
+ ocfs2_update_disk_slot_extended(si, slot_num, &bh);
+ else
+ ocfs2_update_disk_slot_old(si, slot_num, &bh);
spin_unlock(&osb->osb_lock);

- status = ocfs2_write_block(osb, si->si_bh[0], si->si_inode);
+ status = ocfs2_write_block(osb, bh, si->si_inode);
if (status < 0)
mlog_errno(status);

@@ -165,7 +232,12 @@ static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
{
unsigned long long bytes_needed;

- bytes_needed = osb->max_slots * sizeof(__le16);
+ if (ocfs2_uses_extended_slot_map(osb)) {
+ bytes_needed = osb->max_slots *
+ sizeof(struct ocfs2_extended_slot);
+ } else {
+ bytes_needed = osb->max_slots * sizeof(__le16);
+ }
if (bytes_needed > i_size_read(inode)) {
mlog(ML_ERROR,
"Slot map file is too small! (size %llu, needed %llu)\n",
@@ -279,7 +351,7 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
ocfs2_invalidate_slot(si, slot_num);
spin_unlock(&osb->osb_lock);

- return ocfs2_update_disk_slots(osb, osb->slot_info);
+ return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
}

static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
@@ -301,6 +373,16 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
if (!si->si_blocks)
goto bail;

+ if (si->si_extended)
+ si->si_slots_per_block =
+ (osb->sb->s_blocksize /
+ sizeof(struct ocfs2_extended_slot));
+ else
+ si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
+
+ /* The size checks above should ensure this */
+ BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
+
mlog(0, "Slot map needs %u buffers for %llu bytes\n",
si->si_blocks, bytes);

@@ -352,6 +434,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
goto bail;
}

+ si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
si->si_slots = (struct ocfs2_slot *)((char *)si +
sizeof(struct ocfs2_slot_info));
@@ -425,7 +508,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)

mlog(0, "taking node slot %d\n", osb->slot_num);

- status = ocfs2_update_disk_slots(osb, si);
+ status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
if (status < 0)
mlog_errno(status);

@@ -436,7 +519,7 @@ bail:

void ocfs2_put_slot(struct ocfs2_super *osb)
{
- int status;
+ int status, slot_num;
struct ocfs2_slot_info *si = osb->slot_info;

if (!si)
@@ -445,11 +528,12 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);

+ slot_num = osb->slot_num;
ocfs2_invalidate_slot(si, osb->slot_num);
osb->slot_num = OCFS2_INVALID_SLOT;
spin_unlock(&osb->osb_lock);

- status = ocfs2_update_disk_slots(osb, si);
+ status = ocfs2_update_disk_slot(osb, si, slot_num);
if (status < 0) {
mlog_errno(status);
goto bail;
--
1.5.3.8

2008-03-05 22:57:41

by Joel Becker

[permalink] [raw]
Subject: [PATCH 1/7] ocfs2: Move slot map access into slot_map.c

From: Mark Fasheh <[email protected]>

journal.c and dlmglue.c would refresh the slot map by hand. Instead, have
the update and clear functions do the work inside slot_map.c. The eventual
result is to make ocfs2_slot_info defined privately in slot_map.c.

Signed-off-by: Joel Becker <[email protected]>
---
fs/ocfs2/dlmglue.c | 8 +-----
fs/ocfs2/journal.c | 3 +-
fs/ocfs2/slot_map.c | 62 +++++++++++++++++++++++++++++++++++++++-----------
fs/ocfs2/slot_map.h | 11 +++-----
fs/ocfs2/super.c | 3 +-
5 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f779430..33c8a65 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2132,8 +2132,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
int status = 0;
int level = ex ? LKM_EXMODE : LKM_PRMODE;
struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
- struct buffer_head *bh;
- struct ocfs2_slot_info *si = osb->slot_info;

mlog_entry_void();

@@ -2159,11 +2157,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
goto bail;
}
if (status) {
- bh = si->si_bh;
- status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
- si->si_inode);
- if (status == 0)
- ocfs2_update_slot_info(si);
+ status = ocfs2_refresh_slot_info(osb);

ocfs2_complete_lock_res_refresh(lockres, status);

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f31c7e8..c2e654e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1123,8 +1123,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,

/* Likewise, this would be a strange but ultimately not so
* harmful place to get an error... */
- ocfs2_clear_slot(si, slot_num);
- status = ocfs2_update_disk_slots(osb, si);
+ status = ocfs2_clear_slot(osb, slot_num);
if (status < 0)
mlog_errno(status);

diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 3a50ce5..f5727b8 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -49,7 +49,7 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
s16 node_num);

/* post the slot information on disk into our slot_info struct. */
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
{
int i;
__le16 *disk_info;
@@ -65,10 +65,27 @@ void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
spin_unlock(&si->si_lock);
}

+int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
+{
+ int ret;
+ struct ocfs2_slot_info *si = osb->slot_info;
+ struct buffer_head *bh;
+
+ if (si == NULL)
+ return 0;
+
+ bh = si->si_bh;
+ ret = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, si->si_inode);
+ if (ret == 0)
+ ocfs2_update_slot_info(si);
+
+ return ret;
+}
+
/* post the our slot info stuff into it's destination bh and write it
* out. */
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
- struct ocfs2_slot_info *si)
+static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+ struct ocfs2_slot_info *si)
{
int status, i;
__le16 *disk_info = (__le16 *) si->si_bh->b_data;
@@ -135,6 +152,19 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
return ret;
}

+static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+{
+ if (si == NULL)
+ return;
+
+ if (si->si_inode)
+ iput(si->si_inode);
+ if (si->si_bh)
+ brelse(si->si_bh);
+
+ kfree(si);
+}
+
static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
s16 slot_num,
s16 node_num)
@@ -147,12 +177,18 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
si->si_global_node_nums[slot_num] = node_num;
}

-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
- s16 slot_num)
+int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num)
{
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ if (si == NULL)
+ return 0;
+
spin_lock(&si->si_lock);
__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
spin_unlock(&si->si_lock);
+
+ return ocfs2_update_disk_slots(osb, osb->slot_info);
}

int ocfs2_init_slot_info(struct ocfs2_super *osb)
@@ -202,18 +238,17 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
osb->slot_info = si;
bail:
if (status < 0 && si)
- ocfs2_free_slot_info(si);
+ __ocfs2_free_slot_info(si);

return status;
}

-void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+void ocfs2_free_slot_info(struct ocfs2_super *osb)
{
- if (si->si_inode)
- iput(si->si_inode);
- if (si->si_bh)
- brelse(si->si_bh);
- kfree(si);
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ osb->slot_info = NULL;
+ __ocfs2_free_slot_info(si);
}

int ocfs2_find_slot(struct ocfs2_super *osb)
@@ -285,7 +320,6 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
}

bail:
- osb->slot_info = NULL;
- ocfs2_free_slot_info(si);
+ ocfs2_free_slot_info(osb);
}

diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 1025872..b029ffd 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -30,7 +30,7 @@
struct ocfs2_slot_info {
spinlock_t si_lock;

- struct inode *si_inode;
+ struct inode *si_inode;
struct buffer_head *si_bh;
unsigned int si_num_slots;
unsigned int si_size;
@@ -38,19 +38,16 @@ struct ocfs2_slot_info {
};

int ocfs2_init_slot_info(struct ocfs2_super *osb);
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
+void ocfs2_free_slot_info(struct ocfs2_super *osb);

int ocfs2_find_slot(struct ocfs2_super *osb);
void ocfs2_put_slot(struct ocfs2_super *osb);

-void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
- struct ocfs2_slot_info *si);
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb);

s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
s16 global);
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
- s16 slot_num);
+int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num);

static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
int slot_num)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index bec75af..fad37af 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1724,8 +1724,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)

/* This function assumes that the caller has the main osb resource */

- if (osb->slot_info)
- ocfs2_free_slot_info(osb->slot_info);
+ ocfs2_free_slot_info(osb);

kfree(osb->osb_orphan_wipes);
/* FIXME
--
1.5.3.8

2008-03-05 22:58:48

by Joel Becker

[permalink] [raw]
Subject: [PATCH 4/7] ocfs2: slot_map I/O based on max_slots.

The slot map code assumed that the slot_map file has exactly one block
allocated. This changes the code to do I/O on as many blocks as are needed
to cover max_slots.

Signed-off-by: Joel Becker <[email protected]>
---
fs/ocfs2/slot_map.c | 128 +++++++++++++++++++++++++++++++++++++++++++--------
1 files changed, 108 insertions(+), 20 deletions(-)

diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 762360d..5bddee1 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -44,7 +44,8 @@

struct ocfs2_slot_info {
struct inode *si_inode;
- struct buffer_head *si_bh;
+ unsigned int si_blocks;
+ struct buffer_head **si_bh;
unsigned int si_num_slots;
unsigned int si_size;
s16 si_global_node_nums[OCFS2_MAX_SLOTS];
@@ -68,7 +69,7 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)

/* we don't read the slot block here as ocfs2_super_lock
* should've made sure we have the most recent copy. */
- disk_info = (__le16 *) si->si_bh->b_data;
+ disk_info = (__le16 *) si->si_bh[0]->b_data;

for (i = 0; i < si->si_size; i++)
si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
@@ -78,13 +79,23 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
{
int ret;
struct ocfs2_slot_info *si = osb->slot_info;
- struct buffer_head *bh;

if (si == NULL)
return 0;

- bh = si->si_bh;
- ret = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, si->si_inode);
+ BUG_ON(si->si_blocks == 0);
+ BUG_ON(si->si_bh == NULL);
+
+ mlog(0, "Refreshing slot map, reading %u block(s)\n",
+ si->si_blocks);
+
+ /*
+ * We pass -1 as blocknr because we expect all of si->si_bh to
+ * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
+ * this is not true, the read of -1 (UINT64_MAX) will fail.
+ */
+ ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
+ si->si_inode);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
@@ -100,20 +111,42 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
struct ocfs2_slot_info *si)
{
int status, i;
- __le16 *disk_info = (__le16 *) si->si_bh->b_data;
+ __le16 *disk_info = (__le16 *) si->si_bh[0]->b_data;

spin_lock(&osb->osb_lock);
for (i = 0; i < si->si_size; i++)
disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
spin_unlock(&osb->osb_lock);

- status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+ status = ocfs2_write_block(osb, si->si_bh[0], si->si_inode);
if (status < 0)
mlog_errno(status);

return status;
}

+/*
+ * Calculate how many bytes are needed by the slot map. Returns
+ * an error if the slot map file is too small.
+ */
+static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
+ struct inode *inode,
+ unsigned long long *bytes)
+{
+ unsigned long long bytes_needed;
+
+ bytes_needed = osb->max_slots * sizeof(__le16);
+ if (bytes_needed > i_size_read(inode)) {
+ mlog(ML_ERROR,
+ "Slot map file is too small! (size %llu, needed %llu)\n",
+ i_size_read(inode), bytes_needed);
+ return -ENOSPC;
+ }
+
+ *bytes = bytes_needed;
+ return 0;
+}
+
/* try to find global node in the slot info. Returns
* OCFS2_INVALID_SLOT if nothing is found. */
static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
@@ -188,13 +221,22 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,

static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
{
+ unsigned int i;
+
if (si == NULL)
return;

if (si->si_inode)
iput(si->si_inode);
- if (si->si_bh)
- brelse(si->si_bh);
+ if (si->si_bh) {
+ for (i = 0; i < si->si_blocks; i++) {
+ if (si->si_bh[i]) {
+ brelse(si->si_bh[i]);
+ si->si_bh[i] = NULL;
+ }
+ }
+ kfree(si->si_bh);
+ }

kfree(si);
}
@@ -225,12 +267,65 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num)
return ocfs2_update_disk_slots(osb, osb->slot_info);
}

+static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
+ struct ocfs2_slot_info *si)
+{
+ int status = 0;
+ u64 blkno;
+ unsigned long long blocks, bytes;
+ unsigned int i;
+ struct buffer_head *bh;
+
+ status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
+ if (status)
+ goto bail;
+
+ blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
+ BUG_ON(blocks > UINT_MAX);
+ si->si_blocks = blocks;
+ if (!si->si_blocks)
+ goto bail;
+
+ mlog(0, "Slot map needs %u buffers for %llu bytes\n",
+ si->si_blocks, bytes);
+
+ si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
+ GFP_KERNEL);
+ if (!si->si_bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ for (i = 0; i < si->si_blocks; i++) {
+ status = ocfs2_extent_map_get_blocks(si->si_inode, i,
+ &blkno, NULL, NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ mlog(0, "Reading slot map block %u at %llu\n", i,
+ (unsigned long long)blkno);
+
+ bh = NULL; /* Acquire a fresh bh */
+ status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ si->si_bh[i] = bh;
+ }
+
+bail:
+ return status;
+}
+
int ocfs2_init_slot_info(struct ocfs2_super *osb)
{
int status, i;
- u64 blkno;
struct inode *inode = NULL;
- struct buffer_head *bh = NULL;
struct ocfs2_slot_info *si;

si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL);
@@ -254,20 +349,13 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
goto bail;
}

- status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
+ si->si_inode = inode;
+ status = ocfs2_map_slot_buffers(osb, si);
if (status < 0) {
mlog_errno(status);
goto bail;
}

- si->si_inode = inode;
- si->si_bh = bh;
osb->slot_info = (struct ocfs2_slot_info *)si;
bail:
if (status < 0 && si)
--
1.5.3.8

2008-03-05 22:59:21

by Joel Becker

[permalink] [raw]
Subject: [PATCH 3/7] ocfs2: Change the recovery map to an array of node numbers.

The old recovery map was a bitmap of node numbers. This was sufficient
for the maximum node number of 254. Going forward, we want node numbers
to be UINT32. Thus, we need a new recovery map.

Note that we can't keep track of slots here. We must write down the
node number to recover *before* we get the locks needed to convert a
node number into a slot number.

The recovery map is now an array of unsigned ints, max_slots in size.
It moves to journal.c with the rest of recovery.

Because it needs to be initialized, we move all of recovery initialization
into a new function, ocfs2_recovery_init(). This actually cleans up
ocfs2_initialize_super() a little as well. Following on, recovery cleanup
becomes part of ocfs2_recovery_exit().

A number of node map functions are rendered obsolete and are removed.

Finally, waiting on recovery is wrapped in a function rather than naked
checks on the recovery_event. This is a cleanup from Mark.

Signed-off-by: Joel Becker <[email protected]>
---
fs/ocfs2/dlmglue.c | 6 +-
fs/ocfs2/heartbeat.c | 111 ------------------------------
fs/ocfs2/heartbeat.h | 14 ----
fs/ocfs2/journal.c | 181 +++++++++++++++++++++++++++++++++++++++++++++----
fs/ocfs2/journal.h | 4 +
fs/ocfs2/ocfs2.h | 3 +-
fs/ocfs2/super.c | 33 ++-------
7 files changed, 182 insertions(+), 170 deletions(-)

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 33c8a65..b4108fe 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1950,8 +1950,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
goto local;

if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
- wait_event(osb->recovery_event,
- ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+ ocfs2_wait_for_recovery(osb);

lockres = &OCFS2_I(inode)->ip_inode_lockres;
level = ex ? LKM_EXMODE : LKM_PRMODE;
@@ -1974,8 +1973,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
* committed to owning this lock so we don't allow signals to
* abort the operation. */
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
- wait_event(osb->recovery_event,
- ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+ ocfs2_wait_for_recovery(osb);

local:
/*
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 0758daf..80de239 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -48,7 +48,6 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
int bit);
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
int bit);
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);

/* special case -1 for now
* TODO: should *really* make sure the calling func never passes -1!! */
@@ -62,7 +61,6 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map)
void ocfs2_init_node_maps(struct ocfs2_super *osb)
{
spin_lock_init(&osb->node_map_lock);
- ocfs2_node_map_init(&osb->recovery_map);
ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
}

@@ -192,112 +190,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
return ret;
}

-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
-{
- int bit;
- bit = find_next_bit(map->map, map->num_nodes, 0);
- if (bit < map->num_nodes)
- return 0;
- return 1;
-}
-
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
- struct ocfs2_node_map *map)
-{
- int ret;
- BUG_ON(map->num_nodes == 0);
- spin_lock(&osb->node_map_lock);
- ret = __ocfs2_node_map_is_empty(map);
- spin_unlock(&osb->node_map_lock);
- return ret;
-}
-
-#if 0
-
-static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
- struct ocfs2_node_map *from)
-{
- BUG_ON(from->num_nodes == 0);
- ocfs2_node_map_init(target);
- __ocfs2_node_map_set(target, from);
-}
-
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs2_node_map_is_only(struct ocfs2_super *osb,
- struct ocfs2_node_map *target,
- int bit)
-{
- struct ocfs2_node_map temp;
- int ret;
-
- spin_lock(&osb->node_map_lock);
- __ocfs2_node_map_dup(&temp, target);
- __ocfs2_node_map_clear_bit(&temp, bit);
- ret = __ocfs2_node_map_is_empty(&temp);
- spin_unlock(&osb->node_map_lock);
-
- return ret;
-}
-
-static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
- struct ocfs2_node_map *from)
-{
- int num_longs, i;
-
- BUG_ON(target->num_nodes != from->num_nodes);
- BUG_ON(target->num_nodes == 0);
-
- num_longs = BITS_TO_LONGS(target->num_nodes);
- for (i = 0; i < num_longs; i++)
- target->map[i] = from->map[i];
-}
-
-#endif /* 0 */
-
-/* Returns whether the recovery bit was actually set - it may not be
- * if a node is still marked as needing recovery */
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
- int num)
-{
- int set = 0;
-
- spin_lock(&osb->node_map_lock);
-
- if (!test_bit(num, osb->recovery_map.map)) {
- __ocfs2_node_map_set_bit(&osb->recovery_map, num);
- set = 1;
- }
-
- spin_unlock(&osb->node_map_lock);
-
- return set;
-}
-
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
- int num)
-{
- ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
-}
-
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
- struct ocfs2_node_map *map,
- int idx)
-{
- int i = idx;
-
- idx = O2NM_INVALID_NODE_NUM;
- spin_lock(&osb->node_map_lock);
- if ((i != O2NM_INVALID_NODE_NUM) &&
- (i >= 0) &&
- (i < map->num_nodes)) {
- while(i < map->num_nodes) {
- if (test_bit(i, map->map)) {
- idx = i;
- break;
- }
- i++;
- }
- }
- spin_unlock(&osb->node_map_lock);
- return idx;
-}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index eac63ae..98d8ffc 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -33,8 +33,6 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb);

/* node map functions - used to keep track of mounted and in-recovery
* nodes. */
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
- struct ocfs2_node_map *map);
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
@@ -44,17 +42,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
- struct ocfs2_node_map *map,
- int idx);
-static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
- struct ocfs2_node_map *map)
-{
- return ocfs2_node_map_iterate(osb, map, 0);
-}
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
- int num);
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
- int num);

#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ed0c6d0..fe679f3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
int slot);
static int ocfs2_commit_thread(void *arg);

+
+/*
+ * The recovery map is a simple array of node numbers to recover.
+ * It is protected by the osb_lock.
+ */
+
+struct ocfs2_recovery_map {
+ int rm_used;
+ unsigned int *rm_entries;
+};
+
+int ocfs2_recovery_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_recovery_map *rm;
+
+ mutex_init(&osb->recovery_lock);
+ osb->disable_recovery = 0;
+ osb->recovery_thread_task = NULL;
+ init_waitqueue_head(&osb->recovery_event);
+
+ rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
+ osb->max_slots * sizeof(unsigned int),
+ GFP_KERNEL);
+ if (!rm) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+
+ rm->rm_entries = (unsigned int *)((char *)rm +
+ sizeof(struct ocfs2_recovery_map));
+ osb->recovery_map = rm;
+
+ return 0;
+}
+
+/* we can't grab the goofy sem lock from inside wait_event, so we use
+ * memory barriers to make sure that we'll see the null task before
+ * being woken up */
+static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
+{
+ mb();
+ return osb->recovery_thread_task != NULL;
+}
+
+void ocfs2_recovery_exit(struct ocfs2_super *osb)
+{
+ struct ocfs2_recovery_map *rm;
+
+ /* disable any new recovery threads and wait for any currently
+ * running ones to exit. Do this before setting the vol_state. */
+ mutex_lock(&osb->recovery_lock);
+ osb->disable_recovery = 1;
+ mutex_unlock(&osb->recovery_lock);
+ wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
+
+ /* At this point, we know that no more recovery threads can be
+ * launched, so wait for any recovery completion work to
+ * complete. */
+ flush_workqueue(ocfs2_wq);
+
+ /*
+ * Now that recovery is shut down, and the osb is about to be
+ * freed, the osb_lock is not taken here.
+ */
+ rm = osb->recovery_map;
+ /* XXX: Should we bug if there are dirty entries? */
+
+ kfree(rm);
+}
+
+/* Returns 1 if node_num is already in the recovery map, 0 otherwise. */
+static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
+ unsigned int node_num)
+{
+ int i;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+ assert_spin_locked(&osb->osb_lock);
+
+ for (i = 0; i < rm->rm_used; i++) {
+ if (rm->rm_entries[i] == node_num)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+ unsigned int node_num)
+{
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+ spin_lock(&osb->osb_lock);
+ if (__ocfs2_recovery_map_test(osb, node_num)) {
+ spin_unlock(&osb->osb_lock);
+ return 1;
+ }
+
+ /* XXX: Can this be exploited? Not from o2dlm... */
+ BUG_ON(rm->rm_used >= osb->max_slots);
+
+ rm->rm_entries[rm->rm_used] = node_num;
+ rm->rm_used++;
+ spin_unlock(&osb->osb_lock);
+
+ return 0;
+}
+
+static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+ unsigned int node_num)
+{
+ int i;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+ spin_lock(&osb->osb_lock);
+
+ for (i = 0; i < rm->rm_used; i++) {
+ if (rm->rm_entries[i] == node_num)
+ break;
+ }
+
+ if (i < rm->rm_used) {
+ /* XXX: be careful with the pointer math */
+ memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
+ (rm->rm_used - i - 1) * sizeof(unsigned int));
+ rm->rm_used--;
+ }
+
+ spin_unlock(&osb->osb_lock);
+}
+
static int ocfs2_commit_cache(struct ocfs2_super *osb)
{
int status = 0;
@@ -650,6 +781,23 @@ bail:
return status;
}

+static int ocfs2_recovery_completed(struct ocfs2_super *osb)
+{
+ int empty;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+ spin_lock(&osb->osb_lock);
+ empty = (rm->rm_used == 0);
+ spin_unlock(&osb->osb_lock);
+
+ return empty;
+}
+
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
+{
+ wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
+}
+
/*
* JBD Might read a cached version of another nodes journal file. We
* don't want this as this file changes often and we get no
@@ -848,6 +996,7 @@ static int __ocfs2_recovery_thread(void *arg)
{
int status, node_num;
struct ocfs2_super *osb = arg;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;

mlog_entry_void();

@@ -863,26 +1012,29 @@ restart:
goto bail;
}

- while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
- node_num = ocfs2_node_map_first_set_bit(osb,
- &osb->recovery_map);
- if (node_num == O2NM_INVALID_NODE_NUM) {
- mlog(0, "Out of nodes to recover.\n");
- break;
- }
+ spin_lock(&osb->osb_lock);
+ while (rm->rm_used) {
+ /* It's always safe to remove entry zero, as we won't
+ * clear it until ocfs2_recover_node() has succeeded. */
+ node_num = rm->rm_entries[0];
+ spin_unlock(&osb->osb_lock);

status = ocfs2_recover_node(osb, node_num);
- if (status < 0) {
+ if (!status) {
+ ocfs2_recovery_map_clear(osb, node_num);
+ } else {
mlog(ML_ERROR,
"Error %d recovering node %d on device (%u,%u)!\n",
status, node_num,
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
mlog(ML_ERROR, "Volume requires unmount.\n");
- continue;
}

- ocfs2_recovery_map_clear(osb, node_num);
+ spin_lock(&osb->osb_lock);
}
+ spin_unlock(&osb->osb_lock);
+ mlog(0, "All nodes recovered\n");
+
ocfs2_super_unlock(osb, 1);

/* We always run recovery on our own orphan dir - the dead
@@ -893,8 +1045,7 @@ restart:

bail:
mutex_lock(&osb->recovery_lock);
- if (!status &&
- !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+ if (!status && !ocfs2_recovery_completed(osb)) {
mutex_unlock(&osb->recovery_lock);
goto restart;
}
@@ -924,8 +1075,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)

/* People waiting on recovery will wait on
* the recovery map to empty. */
- if (!ocfs2_recovery_map_set(osb, node_num))
- mlog(0, "node %d already be in recovery.\n", node_num);
+ if (ocfs2_recovery_map_set(osb, node_num))
+ mlog(0, "node %d already in recovery map.\n", node_num);

mlog(0, "starting recovery thread...\n");

@@ -1197,7 +1348,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
if (status == -ENOENT)
continue;

- if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
+ if (__ocfs2_recovery_map_test(osb, node_num))
continue;
spin_unlock(&osb->osb_lock);

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 220f3e8..db82be2 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,

/* Exported only for the journal struct init code in super.c. Do not call. */
void ocfs2_complete_recovery(struct work_struct *work);
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
+
+int ocfs2_recovery_init(struct ocfs2_super *osb);
+void ocfs2_recovery_exit(struct ocfs2_super *osb);

/*
* Journal Control:
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ee3f675..c6ed8c3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -180,6 +180,7 @@ enum ocfs2_mount_options

struct ocfs2_journal;
struct ocfs2_slot_info;
+struct ocfs2_recovery_map;
struct ocfs2_super
{
struct task_struct *commit_task;
@@ -191,7 +192,6 @@ struct ocfs2_super
struct ocfs2_slot_info *slot_info;

spinlock_t node_map_lock;
- struct ocfs2_node_map recovery_map;

u64 root_blkno;
u64 system_dir_blkno;
@@ -226,6 +226,7 @@ struct ocfs2_super

atomic_t vol_state;
struct mutex recovery_lock;
+ struct ocfs2_recovery_map *recovery_map;
struct task_struct *recovery_thread_task;
int disable_recovery;
wait_queue_head_t checkpoint_event;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fad37af..1a4c7c7 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1224,15 +1224,6 @@ leave:
return status;
}

-/* we can't grab the goofy sem lock from inside wait_event, so we use
- * memory barriers to make sure that we'll see the null task before
- * being woken up */
-static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
-{
- mb();
- return osb->recovery_thread_task != NULL;
-}
-
static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
{
int tmp;
@@ -1249,17 +1240,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)

ocfs2_truncate_log_shutdown(osb);

- /* disable any new recovery threads and wait for any currently
- * running ones to exit. Do this before setting the vol_state. */
- mutex_lock(&osb->recovery_lock);
- osb->disable_recovery = 1;
- mutex_unlock(&osb->recovery_lock);
- wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
-
- /* At this point, we know that no more recovery threads can be
- * launched, so wait for any recovery completion work to
- * complete. */
- flush_workqueue(ocfs2_wq);
+ /* This will disable recovery and flush any recovery work. */
+ ocfs2_recovery_exit(osb);

ocfs2_journal_shutdown(osb);

@@ -1368,7 +1350,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->s_sectsize_bits = blksize_bits(sector_size);
BUG_ON(!osb->s_sectsize_bits);

- init_waitqueue_head(&osb->recovery_event);
spin_lock_init(&osb->dc_task_lock);
init_waitqueue_head(&osb->dc_event);
osb->dc_work_sequence = 0;
@@ -1388,10 +1369,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));

- mutex_init(&osb->recovery_lock);
-
- osb->disable_recovery = 0;
- osb->recovery_thread_task = NULL;
+ status = ocfs2_recovery_init(osb);
+ if (status) {
+ mlog(ML_ERROR, "Unable to initialize recovery state\n");
+ mlog_errno(status);
+ goto bail;
+ }

init_waitqueue_head(&osb->checkpoint_event);
atomic_set(&osb->needs_checkpoint, 0);
--
1.5.3.8

2008-03-05 22:59:51

by Joel Becker

[permalink] [raw]
Subject: [PATCH 6/7] ocfs2: Define the contents of the slot_map file.

The slot map file is merely an array of __le16. Wrap it in a structure for
cleaner reference.

Signed-off-by: Joel Becker <[email protected]>
---
fs/ocfs2/ocfs2_fs.h | 12 ++++++++++++
fs/ocfs2/slot_map.c | 15 ++++++++-------
2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3633edd..3299116 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -475,6 +475,18 @@ struct ocfs2_extent_block
};

/*
+ * On disk slot map for OCFS2. This defines the contents of the "slot_map"
+ * system file.
+ */
+struct ocfs2_slot_map {
+/*00*/ __le16 sm_slots[0];
+/*
+ * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255,
+ * 255 * sizeof(__le16) == 510B, within the 512B minimum blocksize.
+ */
+};
+
+/*
* On disk superblock for OCFS2
* Note that it is contained inside an ocfs2_dinode, so all offsets
* are relative to the start of ocfs2_dinode.id2.
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 65a61bf..e7e7a74 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -85,17 +85,17 @@ static void ocfs2_set_slot(struct ocfs2_slot_info *si,
static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
{
int i;
- __le16 *disk_info;
+ struct ocfs2_slot_map *sm;

/* we don't read the slot block here as ocfs2_super_lock
* should've made sure we have the most recent copy. */
- disk_info = (__le16 *) si->si_bh[0]->b_data;
+ sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;

for (i = 0; i < si->si_num_slots; i++) {
- if (le16_to_cpu(disk_info[i]) == (u16)OCFS2_INVALID_SLOT)
+ if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
ocfs2_invalidate_slot(si, i);
else
- ocfs2_set_slot(si, i, le16_to_cpu(disk_info[i]));
+ ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
}
}

@@ -135,15 +135,16 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
struct ocfs2_slot_info *si)
{
int status, i;
- __le16 *disk_info = (__le16 *) si->si_bh[0]->b_data;
+ struct ocfs2_slot_map *sm;

spin_lock(&osb->osb_lock);
+ sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
for (i = 0; i < si->si_num_slots; i++) {
if (si->si_slots[i].sl_valid)
- disk_info[i] =
+ sm->sm_slots[i] =
cpu_to_le16(si->si_slots[i].sl_node_num);
else
- disk_info[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
+ sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
}
spin_unlock(&osb->osb_lock);

--
1.5.3.8