2011-08-10 21:11:08

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCHSET 0/4] exofs & pnfsd-exofs assorted changes


Submitted are a change to upstream exofs, and some proper fixes
to pnfsd-exofs.

the patches:
[PATCH 1/4] exofs: Remove unused data_map member from exofs_sb_info

This patch will be put in linux-next for the 3.2 merge window

[PATCH 2/4] SQUASHME: pnfsd-exofs: Convert to ORE (2)
[PATCH 3/4] ore: Make ore_calc_stripe_info EXPORT_SYMBOL
[PATCH 4/4] pnfsd-exofs: Serve out a single group layout at a time

These patches are for the pnfsd tree, including the ore
one which has users only in the pnfsd tree for now.

Thanks
Boaz



2011-08-10 21:18:58

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 4/4] pnfsd-exofs: Serve out a single group layout at a time


The number of devices in a system can get big real fast.
Just last week we tested with a x64 osd system. The layout
buffer sent from the pnfs client has space for about 21
components.

Serve out a single group segment at a time, and only send
a group-full of devices, which is usually not bigger than 8
or 9.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/export.c | 32 +++++++++++++++++++++++++-------
1 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/fs/exofs/export.c b/fs/exofs/export.c
index 10b9adb..5d8333c 100644
--- a/fs/exofs/export.c
+++ b/fs/exofs/export.c
@@ -85,6 +85,15 @@ void ore_layout_2_pnfs_layout(struct pnfs_osd_layout *pl,
pl->olo_map.odm_raid_algorithm = ol->raid_algorithm;
}

+static void _align_io(struct ore_layout *layout, u64 *offset, u64 *length)
+{
+ u64 stripe_size = layout->group_width * layout->stripe_unit;
+ u64 group_size = stripe_size * layout->group_depth;
+
+ *offset = div64_u64(*offset, group_size) * group_size;
+ *length = group_size;
+}
+
static enum nfsstat4 exofs_layout_get(
struct inode *inode,
struct exp_xdr_stream *xdr,
@@ -93,16 +102,24 @@ static enum nfsstat4 exofs_layout_get(
{
struct exofs_i_info *oi = exofs_i(inode);
struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+ struct ore_striping_info si;
struct pnfs_osd_layout layout;
__be32 *start;
unsigned i;
bool in_recall;
enum nfsstat4 nfserr;

- res->lg_seg.offset = 0;
- res->lg_seg.length = NFS4_MAX_UINT64;
+ EXOFS_DBGMSG("(0x%lx) REQUESTED offset=0x%llx len=0x%llx iomod=0x%x\n",
+ inode->i_ino, res->lg_seg.offset,
+ res->lg_seg.length, res->lg_seg.iomode);
+
+ _align_io(&sbi->layout, &res->lg_seg.offset, &res->lg_seg.length);
res->lg_seg.iomode = IOMODE_RW;
- res->lg_return_on_close = true; /* TODO: unused but will be soon */
+ res->lg_return_on_close = true;
+
+ EXOFS_DBGMSG("(0x%lx) RETURNED offset=0x%llx len=0x%llx iomod=0x%x\n",
+ inode->i_ino, res->lg_seg.offset,
+ res->lg_seg.length, res->lg_seg.iomode);

/* skip opaque size, will be filled-in later */
start = exp_xdr_reserve_qwords(xdr, 1);
@@ -114,15 +131,16 @@ static enum nfsstat4 exofs_layout_get(
/* Fill in a pnfs_osd_layout struct */
ore_layout_2_pnfs_layout(&layout, &sbi->layout);

- layout.olo_comps_index = 0;
- layout.olo_num_comps = layout.olo_map.odm_num_comps;
+ ore_calc_stripe_info(&sbi->layout, res->lg_seg.offset, &si);
+ layout.olo_comps_index = si.dev;
+ layout.olo_num_comps = sbi->layout.group_width * sbi->layout.mirrors_p1;

nfserr = pnfs_osd_xdr_encode_layout_hdr(xdr, &layout);
if (unlikely(nfserr))
goto out;

/* Encode layout components */
- for (i = 0; i < layout.olo_num_comps; i++) {
+ for (i = si.dev; i < si.dev + layout.olo_num_comps; i++) {
struct pnfs_osd_object_cred cred;
unsigned sbi_dev = oi->comps.ods - sbi->comps.ods + i;

@@ -145,7 +163,7 @@ static enum nfsstat4 exofs_layout_get(
if (unlikely(nfserr)) {
EXOFS_DBGMSG("(0x%lx) nfserr=%u total=%u encoded=%u\n",
inode->i_ino, nfserr, layout.olo_num_comps,
- i - 1);
+ i - si.dev);
goto out;
}
}
--
1.7.6



2011-08-10 21:16:27

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 2/4] SQUASHME: pnfsd-exofs: Convert to ORE (2)


This is on top of Benny's conversion (SQUASHME):
pnfsd-exofs: convert to v3.1 ORE

This patch does 3 things:
1. Most of the code is doing the conversion to the new ORE code.

2. Upstream exofs patch has removed the sbi->data_map
member see:
exofs: Remove unused data_map member from exofs_sb_info

3. A *creds variable was set to NULL and kfreed at end, which is
a fallout of some past "Convert osd XDR layout encoding"
patch.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/export.c | 58 +++++++++++++++++++++++++++-------------------------
1 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/fs/exofs/export.c b/fs/exofs/export.c
index 07bc5e2..10b9adb 100644
--- a/fs/exofs/export.c
+++ b/fs/exofs/export.c
@@ -73,11 +73,16 @@ err:
return status;
}

-static unsigned exofs_layout_od_id(struct ore_layout *layout,
- struct ore_components *comps,
- osd_id obj_no, unsigned layout_index)
+void ore_layout_2_pnfs_layout(struct pnfs_osd_layout *pl,
+ const struct ore_layout *ol)
{
- return (layout_index + obj_no * layout->mirrors_p1) % comps->numdevs;
+ pl->olo_map.odm_num_comps = ol->group_width * ol->mirrors_p1 *
+ ol->group_count;
+ pl->olo_map.odm_stripe_unit = ol->stripe_unit;
+ pl->olo_map.odm_group_width = ol->group_width;
+ pl->olo_map.odm_group_depth = ol->group_depth;
+ pl->olo_map.odm_mirror_cnt = ol->mirrors_p1 - 1;
+ pl->olo_map.odm_raid_algorithm = ol->raid_algorithm;
}

static enum nfsstat4 exofs_layout_get(
@@ -88,9 +93,6 @@ static enum nfsstat4 exofs_layout_get(
{
struct exofs_i_info *oi = exofs_i(inode);
struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- struct ore_layout *el = &sbi->layout;
- struct ore_components *ec = &sbi->comps;
- struct pnfs_osd_object_cred *creds = NULL;
struct pnfs_osd_layout layout;
__be32 *start;
unsigned i;
@@ -110,29 +112,25 @@ static enum nfsstat4 exofs_layout_get(
}

/* Fill in a pnfs_osd_layout struct */
- layout.olo_map = sbi->data_map;
+ ore_layout_2_pnfs_layout(&layout, &sbi->layout);
+
layout.olo_comps_index = 0;
- layout.olo_num_comps = ec->numdevs;
- layout.olo_comps = creds;
+ layout.olo_num_comps = layout.olo_map.odm_num_comps;

nfserr = pnfs_osd_xdr_encode_layout_hdr(xdr, &layout);
if (unlikely(nfserr))
goto out;

/* Encode layout components */
- for (i = 0; i < ec->numdevs; i++) {
+ for (i = 0; i < layout.olo_num_comps; i++) {
struct pnfs_osd_object_cred cred;
- struct osd_obj_id oid = {
- .partition = sbi->one_comp.obj.partition,
- .id = exofs_oi_objno(oi)
- };
- unsigned dev = exofs_layout_od_id(el, ec, oid.id, i);
+ unsigned sbi_dev = oi->comps.ods - sbi->comps.ods + i;

set_dev_id(&cred.oc_object_id.oid_device_id, args->lg_sbid,
- dev);
- cred.oc_object_id.oid_partition_id = oid.partition;
- cred.oc_object_id.oid_object_id = oid.id;
- cred.oc_osd_version = osd_dev_is_ver1(ec->ods[dev]) ?
+ sbi_dev);
+ cred.oc_object_id.oid_partition_id = oi->one_comp.obj.partition;
+ cred.oc_object_id.oid_object_id = oi->one_comp.obj.id;
+ cred.oc_osd_version = osd_dev_is_ver1(oi->comps.ods[i]) ?
PNFS_OSD_VERSION_1 :
PNFS_OSD_VERSION_2;
cred.oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE;
@@ -141,10 +139,15 @@ static enum nfsstat4 exofs_layout_get(
cred.oc_cap_key.cred = NULL;

cred.oc_cap.cred_len = OSD_CAP_LEN;
- exofs_make_credential(cred.oc_cap.cred, &oid);
+ cred.oc_cap.cred = oi->one_comp.cred;
+
nfserr = pnfs_osd_xdr_encode_layout_cred(xdr, &cred);
- if (unlikely(nfserr))
+ if (unlikely(nfserr)) {
+ EXOFS_DBGMSG("(0x%lx) nfserr=%u total=%u encoded=%u\n",
+ inode->i_ino, nfserr, layout.olo_num_comps,
+ i - 1);
goto out;
+ }
}

exp_xdr_encode_opaque_len(start, xdr->p);
@@ -160,9 +163,9 @@ static enum nfsstat4 exofs_layout_get(
spin_unlock(&oi->i_layout_lock);

out:
- kfree(creds);
- EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n",
- inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start));
+ if (unlikely(nfserr))
+ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n",
+ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start));
return nfserr;
}

@@ -288,7 +291,6 @@ int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr,
const struct nfsd4_pnfs_deviceid *devid)
{
struct exofs_sb_info *sbi = sb->s_fs_info;
- struct ore_components *ec = &sbi->comps;
struct pnfs_osd_deviceaddr devaddr;
const struct osd_dev_info *odi;
u64 devno = devid->devid;
@@ -297,13 +299,13 @@ int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr,

memset(&devaddr, 0, sizeof(devaddr));

- if (unlikely(devno >= ec->numdevs)) {
+ if (unlikely(devno >= sbi->comps.numdevs)) {
EXOFS_DBGMSG("Error: Device((%llx,%llx) does not exist\n",
devid->sbid, devno);
return -ENODEV;
}

- odi = osduld_device_info(ec->ods[devno]);
+ odi = osduld_device_info(sbi->comps.ods[devno]);

devaddr.oda_systemid.len = odi->systemid_len;
devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */
--
1.7.6



2011-08-11 12:19:18

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCHSET 0/4] exofs & pnfsd-exofs assorted changes

On 2011-08-11 00:10, Boaz Harrosh wrote:
>
> Submitted are, a changes to upstream exofs, and some proper fixes
> to pnfsd-exofs.
>
> the patches:
> [PATCH 1/4] exofs: Remove unused data_map member from exofs_sb_info
>
> This patch will be put in linux-next for the 3.2 merge window
>
> [PATCH 2/4] SQUASHME: pnfsd-exofs: Convert to ORE (2)
> [PATCH 3/4] ore: Make ore_calc_stripe_info EXPORT_SYMBOL
> [PATCH 4/4] pnfsd-exofs: Serve out a single group layout at a time
>
> These patches are for the pnfsd tree, including the ore
> one which has users only in the pnfsd tree for now.
>
> Thanks
> Boaz
>

Merged into pnfsd-exofs. Thanks!

Benny

P.S. I don't think my Panasas email address is still active
no need to send stuff there...

2011-08-10 21:17:35

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 3/4] ore: Make ore_calc_stripe_info EXPORT_SYMBOL


The raid math calculations are needed by the layout-export
facility.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/ore.c | 25 +++++++++----------------
include/scsi/osd_ore.h | 10 ++++++++++
2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 25305af..a2c09e7 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -287,16 +287,8 @@ EXPORT_SYMBOL(ore_check_io);
*
* O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
*/
-struct _striping_info {
- u64 obj_offset;
- u64 group_length;
- u64 M; /* for truncate */
- unsigned dev;
- unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
- struct _striping_info *si)
+void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+ struct ore_striping_info *si)
{
u32 stripe_unit = layout->stripe_unit;
u32 group_width = layout->group_width;
@@ -329,6 +321,7 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
si->group_length = T - H;
si->M = M;
}
+EXPORT_SYMBOL(ore_calc_stripe_info);

static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
unsigned pgbase, struct ore_per_dev_state *per_dev,
@@ -375,7 +368,7 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
}

static int _prepare_one_group(struct ore_io_state *ios, u64 length,
- struct _striping_info *si)
+ struct ore_striping_info *si)
{
unsigned stripe_unit = ios->layout->stripe_unit;
unsigned mirrors_p1 = ios->layout->mirrors_p1;
@@ -434,14 +427,14 @@ static int _prepare_for_striping(struct ore_io_state *ios)
{
u64 length = ios->length;
u64 offset = ios->offset;
- struct _striping_info si;
+ struct ore_striping_info si;
int ret = 0;

if (!ios->pages) {
if (ios->kern_buff) {
struct ore_per_dev_state *per_dev = &ios->per_dev[0];

- _calc_stripe_info(ios->layout, ios->offset, &si);
+ ore_calc_stripe_info(ios->layout, ios->offset, &si);
per_dev->offset = si.obj_offset;
per_dev->dev = si.dev;

@@ -455,7 +448,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
}

while (length) {
- _calc_stripe_info(ios->layout, offset, &si);
+ ore_calc_stripe_info(ios->layout, offset, &si);

if (length < si.group_length)
si.group_length = length;
@@ -744,7 +737,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
}

struct _trunc_info {
- struct _striping_info si;
+ struct ore_striping_info si;
u64 prev_group_obj_off;
u64 next_group_obj_off;

@@ -758,7 +751,7 @@ void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
{
unsigned stripe_unit = layout->stripe_unit;

- _calc_stripe_info(layout, file_offset, &ti->si);
+ ore_calc_stripe_info(layout, file_offset, &ti->si);

ti->prev_group_obj_off = ti->si.M * stripe_unit;
ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index 0ac4931..4779ccc 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -96,6 +96,13 @@ struct ore_io_state {
unsigned dev;
} per_dev[];
};
+struct ore_striping_info {
+ u64 obj_offset;
+ u64 group_length;
+ u64 M; /* for truncate */
+ unsigned dev;
+ unsigned unit_off;
+};

static inline unsigned ore_io_state_size(unsigned numdevs)
{
@@ -104,6 +111,9 @@ static inline unsigned ore_io_state_size(unsigned numdevs)
}

/* ore.c */
+void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+ struct ore_striping_info *si);
+
int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
bool is_reading, u64 offset, u64 length,
struct ore_io_state **ios);
--
1.7.6



2011-08-10 21:15:09

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 1/4] exofs: Remove unused data_map member from exofs_sb_info


The struct pnfs_osd_data_map data_map member of exofs_sb_info was
never used after mount. In fact all its members were duplicated
by the ore_layout structure. So just remove the duplicated information.

Also removed some stupid, but perfectly supported, restrictions on
layout parameters. The case where num_devices is not divisible by
mirror_count+1 is perfectly fine since the rotating device view
will eventually use all the devices it can get.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/exofs.h | 3 --
fs/exofs/super.c | 57 ++++++++++++++++++-----------------------------
include/scsi/osd_ore.h | 2 +
3 files changed, 24 insertions(+), 38 deletions(-)

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 21b1c71..474d99f 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -66,9 +66,6 @@ struct exofs_sb_info {
u32 s_next_generation; /* next gen # to use */
atomic_t s_curr_pending; /* number of pending commands */

- struct pnfs_osd_data_map data_map; /* Default raid to use
- * FIXME: Needed ?
- */
struct ore_layout layout; /* Default files layout */
struct ore_comp one_comp; /* id & cred of partition id=0*/
struct ore_components comps; /* comps for the partition */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index fbea138..7a45e78 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -481,64 +481,51 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
{
u64 stripe_length;

- sbi->data_map.odm_num_comps =
- le32_to_cpu(dt->dt_data_map.cb_num_comps);
- sbi->data_map.odm_stripe_unit =
+ sbi->layout.stripe_unit =
le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
- sbi->data_map.odm_group_width =
+ sbi->layout.group_width =
le32_to_cpu(dt->dt_data_map.cb_group_width);
- sbi->data_map.odm_group_depth =
+ sbi->layout.group_depth =
le32_to_cpu(dt->dt_data_map.cb_group_depth);
- sbi->data_map.odm_mirror_cnt =
- le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
- sbi->data_map.odm_raid_algorithm =
+ sbi->layout.mirrors_p1 =
+ le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
+ sbi->layout.raid_algorithm =
le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);

/* FIXME: Only raid0 for now. if not so, do not mount */
- if (sbi->data_map.odm_num_comps != numdevs) {
- EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
- sbi->data_map.odm_num_comps, numdevs);
- return -EINVAL;
- }
- if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
+ if (sbi->layout.raid_algorithm != PNFS_OSD_RAID_0) {
EXOFS_ERR("Only RAID_0 for now\n");
return -EINVAL;
}
- if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
- EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
- numdevs, sbi->data_map.odm_mirror_cnt);
+ if (numdevs < (sbi->layout.group_width * sbi->layout.mirrors_p1)) {
+ EXOFS_ERR("Data Map wrong, "
+ "numdevs=%d < group_width=%d * mirrors=%d\n",
+ numdevs, sbi->layout.group_width,
+ sbi->layout.mirrors_p1);
return -EINVAL;
}

- if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
+ if (0 != (sbi->layout.stripe_unit & ~PAGE_MASK)) {
EXOFS_ERR("Stripe Unit(0x%llx)"
" must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
+ _LLU(sbi->layout.stripe_unit), PAGE_SIZE);
return -EINVAL;
}

- sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
- sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
-
- if (sbi->data_map.odm_group_width) {
- sbi->layout.group_width = sbi->data_map.odm_group_width;
- sbi->layout.group_depth = sbi->data_map.odm_group_depth;
+ if (sbi->layout.group_width) {
if (!sbi->layout.group_depth) {
EXOFS_ERR("group_depth == 0 && group_width != 0\n");
return -EINVAL;
}
- sbi->layout.group_count = sbi->data_map.odm_num_comps /
- sbi->layout.mirrors_p1 /
- sbi->data_map.odm_group_width;
+ sbi->layout.group_count = numdevs / sbi->layout.mirrors_p1 /
+ sbi->layout.group_width;
} else {
- if (sbi->data_map.odm_group_depth) {
+ if (sbi->layout.group_depth) {
printk(KERN_NOTICE "Warning: group_depth ignored "
- "group_width == 0 && group_depth == %d\n",
- sbi->data_map.odm_group_depth);
- sbi->data_map.odm_group_depth = 0;
+ "group_width == 0 && group_depth == %lld\n",
+ _LLU(sbi->layout.group_depth));
}
- sbi->layout.group_width = sbi->data_map.odm_num_comps /
- sbi->layout.mirrors_p1;
+ sbi->layout.group_width = numdevs / sbi->layout.mirrors_p1;
sbi->layout.group_depth = -1;
sbi->layout.group_count = 1;
}
@@ -558,7 +545,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
sbi->layout.group_width,
_LLU(sbi->layout.group_depth),
sbi->layout.mirrors_p1,
- sbi->data_map.odm_raid_algorithm);
+ sbi->layout.raid_algorithm);
return 0;
}

diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index c5c5e00..0ac4931 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -34,6 +34,8 @@ struct ore_comp {

struct ore_layout {
/* Our way of looking at the data_map */
+ enum pnfs_osd_raid_algorithm4
+ raid_algorithm;
unsigned stripe_unit;
unsigned mirrors_p1;

--
1.7.6



2011-08-11 18:18:56

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCHSET 0/4] exofs & pnfsd-exofs assorted changes

On 08/11/2011 05:19 AM, Benny Halevy wrote:
> Benny
>
> P.S. I don't think my Panasas email address is still active
> no need to send stuff there...

Yes! Sorry about that. my mailer keeps popping it out
and I forget to notice. I'll do some cleaning

Thanks
Boaz