2011-10-04 10:24:57

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCHSET 00/19] objlayout: Move to ORE


Submitted is the move of the objects-layout-driver to the ORE
(Objects Raid Engine), which after this patchset will be
used by both the exofs file system and the objlayoutdriver.
(ore.ko has been its own library since the last Kernel.)

This code is intended for the 3.2 Kernel and is already
collecting dust in linux-next (though the latest bits are from today).

At the end of this week I will post the RAID5 support for ORE and for both
exofs and objlayoutdriver. That is also meant for 3.2.

Trond Hi
The first 12 patches are to the ore and exofs, to make them
compatible with objlayoutdriver. The pnfs-obj patches are
dependent on the first part being present.
We can either submit them all through your tree, or alternatively
you can send your Acked-by: on the last 7 and I can submit them
to Linus through my tree. Whichever you prefer.

Benny Hi
Please, I need your Reviewed-by: on [PATCH 13/19] and [PATCH 14/19]
which change code behaviour. The rest of the patches are just conversions
which in theory do not add or change any functionality. (Unless there
are bugs, but that's not intended)

Also, With these patches panfs-layout-driver is totally broken.
Please just remove it once you rebase on these patches. With the RAID5
support it is no longer needed. I've started testing with PanFS export
through the STD objlayoutdriver, hope to finish this week. (We always
have the old versions)

Also tomorrow I will send the needed patch for pnfsd-exofs branch
that works with these changes.

This is the list of patches:

[PATCH 01/19] exofs: Rename struct ore_components comps => oc
[PATCH 02/19] exofs: Remove unused data_map member from exofs_sb_info
[PATCH 03/19] ore: Make ore_striping_info and ore_calc_stripe_info public
[PATCH 04/19] ore/exofs: Change the type of the devices array (API change)
[PATCH 05/19] ore: Only IO one group at a time (API change)
[PATCH 06/19] ore: cleanup: Embed an ore_striping_info inside ore_io_state
[PATCH 07/19] ore: Remove check for ios->kern_buff in _prepare_for_striping to later
[PATCH 08/19] exofs: Support for short read/writes
[PATCH 09/19] ore: Support for short read/writes
[PATCH 10/19] ore: Support for partial component table
[PATCH 11/19] ore/exofs: Define new ore_verify_layout
[PATCH 12/19] ore/exofs: Change ore_check_io API

Up to here are the changes needed to ore and exofs so the ore
can be used by the objlayoutdriver. Any review is welcome.
Same API will be used for RAID4/5/6 support.

[PATCH 13/19] pnfs-obj: Remove redundant EOF from objlayout_io_state
[PATCH 14/19] pnfs-obj: Return PNFS_NOT_ATTEMPTED in case of read/write_pagelist

Benny, please review these two. They are independent of the ORE
conversion. I think [PATCH 14/19] might not be enough and
error handling needs "more", but the needed changes belong in their
own patch, to come later.

[PATCH 15/19] pnfs-obj: Get rid of objlayout_{alloc,free}_io_state
[PATCH 16/19] pnfs-obj: Rename objlayout_io_state => objlayout_io_res
[PATCH 17/19] pnfs-obj: move to ore 01: ore_layout & ore_components
[PATCH 18/19] pnfs-obj: move to ore 02: move to ORE
[PATCH 19/19] pnfs-obj: move to ore 03: Remove old raid engine

These 5 patches stage the move to the ore. With these patches I'm
able to pass all the tests I passed with the old code,
only now with more than 500 fewer lines of code.

Cheers
Boaz


2011-10-11 02:33:28

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 17/19] pnfs-obj: move to ore 01: ore_layout & ore_components

On 2011-10-04 06:36, Boaz Harrosh wrote:
> For Ease of reviewing I split the move to ore into 3 parts
> move to ore 01: ore_layout & ore_components
> move to ore 02: move to ORE
> move to ore 03: Remove old raid engine
>
> This patch modifies the objio_lseg, layout-segment level
> and devices and components arrays to use the ORE types.
>
> Though it will be removed soon, also the raid engine
> is modified to actually compile, possibly run, with
> the new types. So it is the same old raid engine but
> with some new ORE types.
>
> For Ease of reviewing, some of the old code is
> "#if 0" but is not removed so the diff command works
> better. The old code will be removed in the 3rd patch.
>
> Signed-off-by: Boaz Harrosh <[email protected]>

Acked-by: Benny Halevy <[email protected]>

Thanks!

> ---
> fs/nfs/objlayout/objio_osd.c | 272 ++++++++++++++++++++----------------------
> 1 files changed, 128 insertions(+), 144 deletions(-)
>
> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
> index 2347e0a..bd7ec26 100644
> --- a/fs/nfs/objlayout/objio_osd.c
> +++ b/fs/nfs/objlayout/objio_osd.c
> @@ -38,7 +38,7 @@
> */
>
> #include <linux/module.h>
> -#include <scsi/osd_initiator.h>
> +#include <scsi/osd_ore.h>
>
> #include "objlayout.h"
>
> @@ -52,7 +52,7 @@ enum { BIO_MAX_PAGES_KMALLOC =
>
> struct objio_dev_ent {
> struct nfs4_deviceid_node id_node;
> - struct osd_dev *od;
> + struct ore_dev od;
> };
>
> static void
> @@ -60,8 +60,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
> {
> struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
>
> - dprintk("%s: free od=%p\n", __func__, de->od);
> - osduld_put_device(de->od);
> + dprintk("%s: free od=%p\n", __func__, de->od.od);
> + osduld_put_device(de->od.od);
> kfree(de);
> }
>
> @@ -98,12 +98,12 @@ _dev_list_add(const struct nfs_server *nfss,
> nfss->pnfs_curr_ld,
> nfss->nfs_client,
> d_id);
> - de->od = od;
> + de->od.od = od;
>
> d = nfs4_insert_deviceid_node(&de->id_node);
> n = container_of(d, struct objio_dev_ent, id_node);
> if (n != de) {
> - dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
> + dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
> objio_free_deviceid_node(&de->id_node);
> de = n;
> }
> @@ -111,28 +111,11 @@ _dev_list_add(const struct nfs_server *nfss,
> return de;
> }
>
> -struct caps_buffers {
> - u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
> - u8 creds[OSD_CAP_LEN];
> -};
> -
> struct objio_segment {
> struct pnfs_layout_segment lseg;
>
> - struct pnfs_osd_object_cred *comps;
> -
> - unsigned mirrors_p1;
> - unsigned stripe_unit;
> - unsigned group_width; /* Data stripe_units without integrity comps */
> - u64 group_depth;
> - unsigned group_count;
> -
> - unsigned max_io_size;
> -
> - unsigned comps_index;
> - unsigned num_comps;
> - /* variable length */
> - struct objio_dev_ent *ods[];
> + struct ore_layout layout;
> + struct ore_components oc;
> };
>
> static inline struct objio_segment *
> @@ -155,7 +138,8 @@ struct objio_state {
> loff_t offset;
> bool sync;
>
> - struct objio_segment *layout;
> + struct ore_layout *layout;
> + struct ore_components *oc;
>
> struct kref kref;
> objio_done_fn done;
> @@ -175,32 +159,33 @@ struct objio_state {
>
> /* Send and wait for a get_device_info of devices in the layout,
> then look them up with the osd_initiator library */
> -static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
> - struct objio_segment *objio_seg, unsigned comp,
> - gfp_t gfp_flags)
> +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
> + struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
> + gfp_t gfp_flags)
> {
> struct pnfs_osd_deviceaddr *deviceaddr;
> - struct nfs4_deviceid *d_id;
> struct objio_dev_ent *ode;
> struct osd_dev *od;
> struct osd_dev_info odi;
> int err;
>
> - d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
> -
> ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
> - if (ode)
> - return ode;
> + if (ode) {
> + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
> + return 0;
> + }
>
> err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
> if (unlikely(err)) {
> dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
> __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
> - return ERR_PTR(err);
> + return err;
> }
>
> odi.systemid_len = deviceaddr->oda_systemid.len;
> if (odi.systemid_len > sizeof(odi.systemid)) {
> + dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
> + __func__, sizeof(odi.systemid));
> err = -EINVAL;
> goto out;
> } else if (odi.systemid_len)
> @@ -225,38 +210,15 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
>
> ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
> gfp_flags);
> -
> + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
> + dprintk("Adding new dev_id(%llx:%llx)\n",
> + _DEVID_LO(d_id), _DEVID_HI(d_id));
> out:
> - dprintk("%s: return=%d\n", __func__, err);
> objlayout_put_deviceinfo(deviceaddr);
> - return err ? ERR_PTR(err) : ode;
> -}
> -
> -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
> - struct objio_segment *objio_seg,
> - gfp_t gfp_flags)
> -{
> - unsigned i;
> - int err;
> -
> - /* lookup all devices */
> - for (i = 0; i < objio_seg->num_comps; i++) {
> - struct objio_dev_ent *ode;
> -
> - ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
> - if (unlikely(IS_ERR(ode))) {
> - err = PTR_ERR(ode);
> - goto out;
> - }
> - objio_seg->ods[i] = ode;
> - }
> - err = 0;
> -
> -out:
> - dprintk("%s: return=%d\n", __func__, err);
> return err;
> }
>
> +#if 0
> static int _verify_data_map(struct pnfs_osd_layout *layout)
> {
> struct pnfs_osd_data_map *data_map = &layout->olo_map;
> @@ -296,23 +258,45 @@ static int _verify_data_map(struct pnfs_osd_layout *layout)
>
> return 0;
> }
> +#endif
>
> -static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
> - struct pnfs_osd_object_cred *src_comp,
> - struct caps_buffers *caps_p)
> +static void copy_single_comp(struct ore_components *oc, unsigned c,
> + struct pnfs_osd_object_cred *src_comp)
> {
> - WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
> - WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
> + struct ore_comp *ocomp = &oc->comps[c];
>
> - *cur_comp = *src_comp;
> + WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
> + WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
>
> - memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
> - sizeof(caps_p->caps_key));
> - cur_comp->oc_cap_key.cred = caps_p->caps_key;
> + ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
> + ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
>
> - memcpy(caps_p->creds, src_comp->oc_cap.cred,
> - sizeof(caps_p->creds));
> - cur_comp->oc_cap.cred = caps_p->creds;
> + memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
> +}
> +
> +int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
> + struct objio_segment **pseg)
> +{
> + struct __alloc_objio_segment {
> + struct objio_segment olseg;
> + struct ore_dev *ods[numdevs];
> + struct ore_comp comps[numdevs];
> + } *aolseg;
> +
> + aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
> + if (unlikely(!aolseg)) {
> + dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
> + numdevs, sizeof(*aolseg));
> + return -ENOMEM;
> + }
> +
> + aolseg->olseg.oc.numdevs = numdevs;
> + aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
> + aolseg->olseg.oc.comps = aolseg->comps;
> + aolseg->olseg.oc.ods = aolseg->ods;
> +
> + *pseg = &aolseg->olseg;
> + return 0;
> }
>
> int objio_alloc_lseg(struct pnfs_layout_segment **outp,
> @@ -324,59 +308,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
> struct objio_segment *objio_seg;
> struct pnfs_osd_xdr_decode_layout_iter iter;
> struct pnfs_osd_layout layout;
> - struct pnfs_osd_object_cred *cur_comp, src_comp;
> - struct caps_buffers *caps_p;
> + struct pnfs_osd_object_cred src_comp;
> + unsigned cur_comp;
> int err;
>
> err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
> if (unlikely(err))
> return err;
>
> - err = _verify_data_map(&layout);
> + err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
> if (unlikely(err))
> return err;
>
> - objio_seg = kzalloc(sizeof(*objio_seg) +
> - sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
> - sizeof(*objio_seg->comps) * layout.olo_num_comps +
> - sizeof(struct caps_buffers) * layout.olo_num_comps,
> - gfp_flags);
> - if (!objio_seg)
> - return -ENOMEM;
> + objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
> + objio_seg->layout.group_width = layout.olo_map.odm_group_width;
> + objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
> + objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
> + objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
>
> - objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
> - cur_comp = objio_seg->comps;
> - caps_p = (void *)(cur_comp + layout.olo_num_comps);
> - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
> - copy_single_comp(cur_comp++, &src_comp, caps_p++);
> + err = ore_verify_layout(layout.olo_map.odm_num_comps,
> + &objio_seg->layout);
> if (unlikely(err))
> goto err;
>
> - objio_seg->num_comps = layout.olo_num_comps;
> - objio_seg->comps_index = layout.olo_comps_index;
> - err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
> - if (err)
> - goto err;
> -
> - objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
> - objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
> - if (layout.olo_map.odm_group_width) {
> - objio_seg->group_width = layout.olo_map.odm_group_width;
> - objio_seg->group_depth = layout.olo_map.odm_group_depth;
> - objio_seg->group_count = layout.olo_map.odm_num_comps /
> - objio_seg->mirrors_p1 /
> - objio_seg->group_width;
> - } else {
> - objio_seg->group_width = layout.olo_map.odm_num_comps /
> - objio_seg->mirrors_p1;
> - objio_seg->group_depth = -1;
> - objio_seg->group_count = 1;
> + objio_seg->oc.first_dev = layout.olo_comps_index;
> + cur_comp = 0;
> + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
> + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
> + err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
> + &src_comp.oc_object_id.oid_device_id,
> + gfp_flags);
> + if (err)
> + goto err;
> + ++cur_comp;
> }
> -
> - /* Cache this calculation it will hit for every page */
> - objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
> - objio_seg->stripe_unit) *
> - objio_seg->group_width;
> + /* pnfs_osd_xdr_decode_layout_comp returns false on error */
> + if (unlikely(err))
> + goto err;
>
> *outp = &objio_seg->lseg;
> return 0;
> @@ -393,10 +361,14 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
> int i;
> struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
>
> - for (i = 0; i < objio_seg->num_comps; i++) {
> - if (!objio_seg->ods[i])
> + for (i = 0; i < objio_seg->oc.numdevs; i++) {
> + struct ore_dev *od = objio_seg->oc.ods[i];
> + struct objio_dev_ent *ode;
> +
> + if (!od)
> break;
> - nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
> + ode = container_of(od, typeof(*ode), od);
> + nfs4_put_deviceid_node(&ode->id_node);
> }
> kfree(objio_seg);
> }
> @@ -411,8 +383,8 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
> struct objio_state *ios;
> struct __alloc_objio_state {
> struct objio_state objios;
> - struct _objio_per_comp per_dev[objio_seg->num_comps];
> - struct pnfs_osd_ioerr ioerrs[objio_seg->num_comps];
> + struct _objio_per_comp per_dev[objio_seg->oc.numdevs];
> + struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
> } *aos;
>
> aos = kzalloc(sizeof(*aos), gfp_flags);
> @@ -421,8 +393,9 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
>
> ios = &aos->objios;
>
> - ios->layout = objio_seg;
> - objlayout_init_ioerrs(&aos->objios.oir, objio_seg->num_comps,
> + ios->layout = &objio_seg->layout;
> + ios->oc = &objio_seg->oc;
> + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
> aos->ioerrs, rpcdata, pnfs_layout_type);
>
> ios->pages = pages;
> @@ -474,6 +447,27 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
> }
> }
>
> +static void __on_dev_error(struct objio_state *ios, bool is_write,
> + struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
> + u64 dev_offset, u64 dev_len)
> +{
> + struct objio_state *objios = ios->private;
> + struct pnfs_osd_objid pooid;
> + struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
> + /* FIXME: what to do with more-then-one-group layouts. We need to
> + * translate from ore_io_state index to oc->comps index
> + */
> + unsigned comp = dev_index;
> +
> + pooid.oid_device_id = ode->id_node.deviceid;
> + pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
> + pooid.oid_object_id = ios->oc->comps[comp].obj.id;
> +
> + objlayout_io_set_result(&objios->oir, comp,
> + &pooid, osd_pri_2_pnfs_err(oep),
> + dev_offset, dev_len, is_write);
> +}
> +
> static void _clear_bio(struct bio *bio)
> {
> struct bio_vec *bv;
> @@ -518,12 +512,9 @@ static int _io_check(struct objio_state *ios, bool is_write)
>
> continue; /* we recovered */
> }
> - objlayout_io_set_result(&ios->oir, i,
> - &ios->layout->comps[i].oc_object_id,
> - osd_pri_2_pnfs_err(osi.osd_err_pri),
> - ios->per_dev[i].offset,
> - ios->per_dev[i].length,
> - is_write);
> + __on_dev_error(ios, is_write, ios->oc->ods[i],
> + ios->per_dev[i].dev, osi.osd_err_pri,
> + ios->per_dev[i].offset, ios->per_dev[i].length);
>
> if (osi.osd_err_pri >= oep) {
> oep = osi.osd_err_pri;
> @@ -558,11 +549,11 @@ static void _io_free(struct objio_state *ios)
>
> struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
> {
> - unsigned min_dev = ios->layout->comps_index;
> - unsigned max_dev = min_dev + ios->layout->num_comps;
> + unsigned min_dev = ios->oc->first_dev;
> + unsigned max_dev = min_dev + ios->oc->numdevs;
>
> BUG_ON(dev < min_dev || max_dev <= dev);
> - return ios->layout->ods[dev - min_dev]->od;
> + return ios->oc->ods[dev - min_dev]->od;
> }
>
> struct _striping_info {
> @@ -820,12 +811,9 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
> struct osd_request *or = NULL;
> struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
> unsigned dev = per_dev->dev;
> - struct pnfs_osd_object_cred *cred =
> - &ios->layout->comps[cur_comp];
> - struct osd_obj_id obj = {
> - .partition = cred->oc_object_id.oid_partition_id,
> - .id = cred->oc_object_id.oid_object_id,
> - };
> + struct ore_comp *cred =
> + &ios->oc->comps[cur_comp];
> + struct osd_obj_id obj = cred->obj;
> int ret;
>
> or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
> @@ -837,7 +825,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
>
> osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
>
> - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
> + ret = osd_finalize_request(or, 0, cred->cred, NULL);
> if (ret) {
> dprintk("%s: Faild to osd_finalize_request() => %d\n",
> __func__, ret);
> @@ -924,12 +912,8 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
>
> for (; cur_comp < last_comp; ++cur_comp, ++dev) {
> struct osd_request *or = NULL;
> - struct pnfs_osd_object_cred *cred =
> - &ios->layout->comps[cur_comp];
> - struct osd_obj_id obj = {
> - .partition = cred->oc_object_id.oid_partition_id,
> - .id = cred->oc_object_id.oid_object_id,
> - };
> + struct ore_comp *cred = &ios->oc->comps[cur_comp];
> + struct osd_obj_id obj = cred->obj;
> struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
> struct bio *bio;
>
> @@ -964,7 +948,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
>
> osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
>
> - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
> + ret = osd_finalize_request(or, 0, cred->cred, NULL);
> if (ret) {
> dprintk("%s: Faild to osd_finalize_request() => %d\n",
> __func__, ret);
> @@ -1030,7 +1014,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
> return false;
>
> return pgio->pg_count + req->wb_bytes <=
> - OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
> + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
> }
>
> static const struct nfs_pageio_ops objio_pg_read_ops = {

2011-10-11 02:32:47

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 15/19] pnfs-obj: Get rid of objlayout_{alloc,free}_io_state

On 2011-10-04 06:35, Boaz Harrosh wrote:
> This is part of moving objio_osd to use the ORE.
>
> objlayout_io_state had two functions:
> 1. It was used in the error reporting mechanism at layout_return.
> This function is kept intact.
> (Later patch will rename objlayout_io_state => objlayout_io_res)
> 2. Carrier of rw io members into the objio_read/write_pagelist API.
> This is removed in this patch.
>
> The {r,w}data received from NFS are passed directly to the
> objio_{read,write}_pagelist API. The io_engine is now allocating
> its own IO state as part of the read/write. The minimal
> functionality that was part of the generic allocation is passed
> to the io_engine.
>
> So part of this patch is rename of:
> ios->ol_state.foo => ios->foo
>
> At objlayout_{read,write}_done an objlayout_io_state is passed that
> denotes the result of the IO. (Hence the later name change).
> If the IO is successful objlayout calls an objio_free_result() API
> immediately (Which for objio_osd causes the release of the io_state).
> If the IO ended in an error it is held onto until reported in
> layout_return and is released later through the objio_free_result()
> API. (All this is not new, just renamed and cleaned up.)
>
> Signed-off-by: Boaz Harrosh <[email protected]>

Reviewed-by: Benny Halevy <[email protected]>

Thanks!

> ---
> fs/nfs/objlayout/objio_osd.c | 94 ++++++++++++++++++++++----------
> fs/nfs/objlayout/objlayout.c | 124 +++++++++++-------------------------------
> fs/nfs/objlayout/objlayout.h | 36 ++++++-------
> 3 files changed, 112 insertions(+), 142 deletions(-)
>
> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
> index 0c7c9ec..48eb91a 100644
> --- a/fs/nfs/objlayout/objio_osd.c
> +++ b/fs/nfs/objlayout/objio_osd.c
> @@ -148,6 +148,13 @@ struct objio_state {
> /* Generic layer */
> struct objlayout_io_state ol_state;
>
> + struct page **pages;
> + unsigned pgbase;
> + unsigned nr_pages;
> + unsigned long count;
> + loff_t offset;
> + bool sync;
> +
> struct objio_segment *layout;
>
> struct kref kref;
> @@ -394,30 +401,43 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
> kfree(objio_seg);
> }
>
> -int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
> - struct objlayout_io_state **outp,
> - gfp_t gfp_flags)
> +static int
> +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
> + struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
> + loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
> + struct objio_state **outp)
> {
> struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
> struct objio_state *ios;
> - const unsigned first_size = sizeof(*ios) +
> - objio_seg->num_comps * sizeof(ios->per_dev[0]);
> - const unsigned sec_size = objio_seg->num_comps *
> - sizeof(ios->ol_state.ioerrs[0]);
> -
> - ios = kzalloc(first_size + sec_size, gfp_flags);
> - if (unlikely(!ios))
> + struct __alloc_objio_state {
> + struct objio_state objios;
> + struct _objio_per_comp per_dev[objio_seg->num_comps];
> + struct pnfs_osd_ioerr ioerrs[objio_seg->num_comps];
> + } *aos;
> +
> + aos = kzalloc(sizeof(*aos), gfp_flags);
> + if (unlikely(!aos))
> return -ENOMEM;
>
> - ios->layout = objio_seg;
> - ios->ol_state.ioerrs = ((void *)ios) + first_size;
> - ios->ol_state.num_comps = objio_seg->num_comps;
> + ios = &aos->objios;
>
> - *outp = &ios->ol_state;
> + ios->layout = objio_seg;
> + objlayout_init_ioerrs(&aos->objios.ol_state, objio_seg->num_comps,
> + aos->ioerrs, rpcdata, pnfs_layout_type);
> +
> + ios->pages = pages;
> + ios->pgbase = pgbase;
> + ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
> + ios->offset = offset;
> + ios->count = count;
> + ios->sync = 0;
> + BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
> +
> + *outp = ios;
> return 0;
> }
>
> -void objio_free_io_state(struct objlayout_io_state *ol_state)
> +void objio_free_result(struct objlayout_io_state *ol_state)
> {
> struct objio_state *ios = container_of(ol_state, struct objio_state,
> ol_state);
> @@ -598,7 +618,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
> if (per_dev->bio == NULL) {
> unsigned pages_in_stripe = ios->layout->group_width *
> (ios->layout->stripe_unit / PAGE_SIZE);
> - unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
> + unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
> ios->layout->group_width;
>
> if (BIO_MAX_PAGES_KMALLOC < bio_size)
> @@ -615,11 +635,11 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
> unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
> unsigned added_len;
>
> - BUG_ON(ios->ol_state.nr_pages <= pg);
> + BUG_ON(ios->nr_pages <= pg);
> cur_len -= pglen;
>
> added_len = bio_add_pc_page(q, per_dev->bio,
> - ios->ol_state.pages[pg], pglen, pgbase);
> + ios->pages[pg], pglen, pgbase);
> if (unlikely(pglen != added_len))
> return -ENOMEM;
> pgbase = 0;
> @@ -660,7 +680,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
> cur_len = stripe_unit - si->unit_off;
> page_off = si->unit_off & ~PAGE_MASK;
> BUG_ON(page_off &&
> - (page_off != ios->ol_state.pgbase));
> + (page_off != ios->pgbase));
> } else { /* dev > si->dev */
> per_dev->offset = si->obj_offset - si->unit_off;
> cur_len = stripe_unit;
> @@ -693,8 +713,8 @@ out:
>
> static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
> {
> - u64 length = ios->ol_state.count;
> - u64 offset = ios->ol_state.offset;
> + u64 length = ios->count;
> + u64 offset = ios->offset;
> struct _striping_info si;
> unsigned last_pg = 0;
> int ret = 0;
> @@ -748,7 +768,7 @@ static int _io_exec(struct objio_state *ios)
> int ret = 0;
> unsigned i;
> objio_done_fn saved_done_fn = ios->done;
> - bool sync = ios->ol_state.sync;
> + bool sync = ios->sync;
>
> if (sync) {
> ios->done = _sync_done;
> @@ -792,7 +812,7 @@ static int _read_done(struct objio_state *ios)
> else
> status = ret;
>
> - objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
> + objlayout_read_done(&ios->ol_state, status, ios->sync);
> return ret;
> }
>
> @@ -854,12 +874,18 @@ err:
> return ret;
> }
>
> -int objio_read_pagelist(struct objlayout_io_state *ol_state)
> +int objio_read_pagelist(struct nfs_read_data *rdata)
> {
> - struct objio_state *ios = container_of(ol_state, struct objio_state,
> - ol_state);
> + struct objio_state *ios;
> int ret;
>
> + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout,
> + rdata->lseg, rdata->args.pages, rdata->args.pgbase,
> + rdata->args.offset, rdata->args.count, rdata,
> + GFP_KERNEL, &ios);
> + if (unlikely(ret))
> + return ret;
> +
> ret = _io_rw_pagelist(ios, GFP_KERNEL);
> if (unlikely(ret))
> return ret;
> @@ -886,7 +912,7 @@ static int _write_done(struct objio_state *ios)
> status = ret;
> }
>
> - objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
> + objlayout_write_done(&ios->ol_state, status, ios->sync);
> return ret;
> }
>
> @@ -976,12 +1002,20 @@ err:
> return ret;
> }
>
> -int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
> +int objio_write_pagelist(struct nfs_write_data *wdata, int how)
> {
> - struct objio_state *ios = container_of(ol_state, struct objio_state,
> - ol_state);
> + struct objio_state *ios;
> int ret;
>
> + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout,
> + wdata->lseg, wdata->args.pages, wdata->args.pgbase,
> + wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
> + &ios);
> + if (unlikely(ret))
> + return ret;
> +
> + ios->sync = 0 != (how & FLUSH_SYNC);
> +
> /* TODO: ios->stable = stable; */
> ret = _io_rw_pagelist(ios, GFP_NOFS);
> if (unlikely(ret))
> diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
> index 99c807d..a82053a 100644
> --- a/fs/nfs/objlayout/objlayout.c
> +++ b/fs/nfs/objlayout/objlayout.c
> @@ -156,59 +156,23 @@ last_byte_offset(u64 start, u64 len)
> return end > start ? end - 1 : NFS4_MAX_UINT64;
> }
>
> -static struct objlayout_io_state *
> -objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
> - struct page **pages,
> - unsigned pgbase,
> - loff_t offset,
> - size_t count,
> - struct pnfs_layout_segment *lseg,
> - void *rpcdata,
> - gfp_t gfp_flags)
> +void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
> + struct page ***p_pages, unsigned *p_pgbase,
> + u64 offset, unsigned long count)
> {
> - struct objlayout_io_state *state;
> u64 lseg_end_offset;
>
> - dprintk("%s: allocating io_state\n", __func__);
> - if (objio_alloc_io_state(lseg, &state, gfp_flags))
> - return NULL;
> -
> BUG_ON(offset < lseg->pls_range.offset);
> lseg_end_offset = end_offset(lseg->pls_range.offset,
> lseg->pls_range.length);
> BUG_ON(offset >= lseg_end_offset);
> - if (offset + count > lseg_end_offset) {
> - count = lseg->pls_range.length -
> - (offset - lseg->pls_range.offset);
> - dprintk("%s: truncated count %Zd\n", __func__, count);
> - }
> + WARN_ON(offset + count > lseg_end_offset);
>
> - if (pgbase > PAGE_SIZE) {
> - pages += pgbase >> PAGE_SHIFT;
> - pgbase &= ~PAGE_MASK;
> + if (*p_pgbase > PAGE_SIZE) {
> + dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
> + *p_pages += *p_pgbase >> PAGE_SHIFT;
> + *p_pgbase &= ~PAGE_MASK;
> }
> -
> - INIT_LIST_HEAD(&state->err_list);
> - state->lseg = lseg;
> - state->rpcdata = rpcdata;
> - state->pages = pages;
> - state->pgbase = pgbase;
> - state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
> - state->offset = offset;
> - state->count = count;
> - state->sync = 0;
> -
> - return state;
> -}
> -
> -static void
> -objlayout_free_io_state(struct objlayout_io_state *state)
> -{
> - dprintk("%s: freeing io_state\n", __func__);
> - if (unlikely(!state))
> - return;
> -
> - objio_free_io_state(state);
> }
>
> /*
> @@ -217,12 +181,10 @@ objlayout_free_io_state(struct objlayout_io_state *state)
> static void
> objlayout_iodone(struct objlayout_io_state *state)
> {
> - dprintk("%s: state %p status\n", __func__, state);
> -
> if (likely(state->status >= 0)) {
> - objlayout_free_io_state(state);
> + objio_free_result(state);
> } else {
> - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
> + struct objlayout *objlay = state->objlay;
>
> spin_lock(&objlay->lock);
> objlay->delta_space_valid = OBJ_DSU_INVALID;
> @@ -289,15 +251,15 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
> {
> struct nfs_read_data *rdata = state->rpcdata;
>
> - state->status = status;
> - dprintk("%s: Begin status=%zd eof=%d\n", __func__,
> - status, rdata->res.eof);
> - rdata->task.tk_status = status;
> + state->status = rdata->task.tk_status = status;
> if (status >= 0)
> rdata->res.count = status;
> objlayout_iodone(state);
> /* must not use state after this point */
>
> + dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
> + status, rdata->res.eof, sync);
> +
> if (sync)
> pnfs_ld_read_done(rdata);
> else {
> @@ -314,7 +276,6 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
> {
> loff_t offset = rdata->args.offset;
> size_t count = rdata->args.count;
> - struct objlayout_io_state *state;
> int err;
> loff_t eof;
>
> @@ -331,20 +292,14 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
> }
>
> rdata->res.eof = (offset + count) >= eof;
> + _fix_verify_io_params(rdata->lseg, &rdata->args.pages,
> + &rdata->args.pgbase,
> + rdata->args.offset, rdata->args.count);
>
> - state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
> - rdata->args.pages, rdata->args.pgbase,
> - offset, count,
> - rdata->lseg, rdata,
> - GFP_KERNEL);
> - if (unlikely(!state)) {
> - err = -ENOMEM;
> - goto out;
> - }
> dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
> __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
>
> - err = objio_read_pagelist(state);
> + err = objio_read_pagelist(rdata);
> out:
> if (unlikely(err)) {
> rdata->pnfs_error = err;
> @@ -374,23 +329,18 @@ void
> objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
> bool sync)
> {
> - struct nfs_write_data *wdata;
> + struct nfs_write_data *wdata = state->rpcdata;
>
> - dprintk("%s: Begin\n", __func__);
> - wdata = state->rpcdata;
> - state->status = status;
> - wdata->task.tk_status = status;
> + state->status = wdata->task.tk_status = status;
> if (status >= 0) {
> wdata->res.count = status;
> wdata->verf.committed = state->committed;
> - dprintk("%s: Return status %d committed %d\n",
> - __func__, wdata->task.tk_status,
> - wdata->verf.committed);
> - } else
> - dprintk("%s: Return status %d\n",
> - __func__, wdata->task.tk_status);
> + }
> objlayout_iodone(state);
> - /* must not use state after this point */
> + /* must not use oir after this point */
> +
> + dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
> + status, wdata->verf.committed, sync);
>
> if (sync)
> pnfs_ld_write_done(wdata);
> @@ -407,25 +357,13 @@ enum pnfs_try_status
> objlayout_write_pagelist(struct nfs_write_data *wdata,
> int how)
> {
> - struct objlayout_io_state *state;
> int err;
>
> - state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
> - wdata->args.pages,
> - wdata->args.pgbase,
> - wdata->args.offset,
> - wdata->args.count,
> - wdata->lseg, wdata,
> - GFP_NOFS);
> - if (unlikely(!state)) {
> - err = -ENOMEM;
> - goto out;
> - }
> + _fix_verify_io_params(wdata->lseg, &wdata->args.pages,
> + &wdata->args.pgbase,
> + wdata->args.offset, wdata->args.count);
>
> - state->sync = how & FLUSH_SYNC;
> -
> - err = objio_write_pagelist(state, how & FLUSH_STABLE);
> - out:
> + err = objio_write_pagelist(wdata, how);
> if (unlikely(err)) {
> wdata->pnfs_error = err;
> dprintk("%s: Returned Error %d\n", __func__, err);
> @@ -564,7 +502,7 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
> merge_ioerr(&accumulated_err, ioerr);
> }
> list_del(&state->err_list);
> - objlayout_free_io_state(state);
> + objio_free_result(state);
> }
>
> pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
> @@ -632,7 +570,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
> goto loop_done;
> }
> list_del(&state->err_list);
> - objlayout_free_io_state(state);
> + objio_free_result(state);
> }
> loop_done:
> spin_unlock(&objlay->lock);
> diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
> index 4edac9b..d7b2ccfa 100644
> --- a/fs/nfs/objlayout/objlayout.h
> +++ b/fs/nfs/objlayout/objlayout.h
> @@ -75,14 +75,7 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
> * embedded in objects provider io_state data structure
> */
> struct objlayout_io_state {
> - struct pnfs_layout_segment *lseg;
> -
> - struct page **pages;
> - unsigned pgbase;
> - unsigned nr_pages;
> - unsigned long count;
> - loff_t offset;
> - bool sync;
> + struct objlayout *objlay;
>
> void *rpcdata;
> int status; /* res */
> @@ -99,6 +92,18 @@ struct objlayout_io_state {
> struct pnfs_osd_ioerr *ioerrs;
> };
>
> +static inline
> +void objlayout_init_ioerrs(struct objlayout_io_state *oir, unsigned num_comps,
> + struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
> + struct pnfs_layout_hdr *pnfs_layout_type)
> +{
> + oir->objlay = OBJLAYOUT(pnfs_layout_type);
> + oir->rpcdata = rpcdata;
> + INIT_LIST_HEAD(&oir->err_list);
> + oir->num_comps = num_comps;
> + oir->ioerrs = ioerrs;
> +}
> +
> /*
> * Raid engine I/O API
> */
> @@ -109,15 +114,10 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
> gfp_t gfp_flags);
> extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
>
> -extern int objio_alloc_io_state(
> - struct pnfs_layout_segment *lseg,
> - struct objlayout_io_state **outp,
> - gfp_t gfp_flags);
> -extern void objio_free_io_state(struct objlayout_io_state *state);
> +extern void objio_free_result(struct objlayout_io_state *state);
>
> -extern int objio_read_pagelist(struct objlayout_io_state *ol_state);
> -extern int objio_write_pagelist(struct objlayout_io_state *ol_state,
> - bool stable);
> +extern int objio_read_pagelist(struct nfs_read_data *rdata);
> +extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
>
> /*
> * callback API
> @@ -127,10 +127,8 @@ extern void objlayout_io_set_result(struct objlayout_io_state *state,
> int osd_error, u64 offset, u64 length, bool is_write);
>
> static inline void
> -objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
> +objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
> {
> - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
> -
> /* If one of the I/Os errored out and the delta_space_used was
> * invalid we render the complete report as invalid. Protocol mandate
> * the DSU be accurate or not reported.

2011-10-11 02:35:54

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 19/19] pnfs-obj: move to ore 03: Remove old raid engine

On 2011-10-04 06:37, Boaz Harrosh wrote:
> Finally remove all the old raid engine, which is by now
> dead code.
>
> Signed-off-by: Boaz Harrosh <[email protected]>

Acked-by: Benny Halevy <[email protected]>

Thanks!

> ---
> fs/nfs/objlayout/objio_osd.c | 504 ------------------------------------------
> 1 files changed, 0 insertions(+), 504 deletions(-)
>
> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
> index 00b3849..3161da6 100644
> --- a/fs/nfs/objlayout/objio_osd.c
> +++ b/fs/nfs/objlayout/objio_osd.c
> @@ -188,48 +188,6 @@ out:
> return err;
> }
>
> -#if 0
> -static int _verify_data_map(struct pnfs_osd_layout *layout)
> -{
> - struct pnfs_osd_data_map *data_map = &layout->olo_map;
> - u64 stripe_length;
> - u32 group_width;
> -
> -/* FIXME: Only raid0 for now. if not go through MDS */
> - if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
> - printk(KERN_ERR "Only RAID_0 for now\n");
> - return -ENOTSUPP;
> - }
> - if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
> - printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
> - data_map->odm_num_comps, data_map->odm_mirror_cnt);
> - return -EINVAL;
> - }
> -
> - if (data_map->odm_group_width)
> - group_width = data_map->odm_group_width;
> - else
> - group_width = data_map->odm_num_comps /
> - (data_map->odm_mirror_cnt + 1);
> -
> - stripe_length = (u64)data_map->odm_stripe_unit * group_width;
> - if (stripe_length >= (1ULL << 32)) {
> - printk(KERN_ERR "Total Stripe length(0x%llx)"
> - " >= 32bit is not supported\n", _LLU(stripe_length));
> - return -ENOTSUPP;
> - }
> -
> - if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
> - printk(KERN_ERR "Stripe Unit(0x%llx)"
> - " must be Multples of PAGE_SIZE(0x%lx)\n",
> - _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
> - return -ENOTSUPP;
> - }
> -
> - return 0;
> -}
> -#endif
> -
> static void copy_single_comp(struct ore_components *oc, unsigned c,
> struct pnfs_osd_object_cred *src_comp)
> {
> @@ -441,327 +399,6 @@ static void __on_dev_error(struct ore_io_state *ios,
> dev_offset, dev_len, !ios->reading);
> }
>
> -#if 0
> -static void _clear_bio(struct bio *bio)
> -{
> - struct bio_vec *bv;
> - unsigned i;
> -
> - __bio_for_each_segment(bv, bio, i, 0) {
> - unsigned this_count = bv->bv_len;
> -
> - if (likely(PAGE_SIZE == this_count))
> - clear_highpage(bv->bv_page);
> - else
> - zero_user(bv->bv_page, bv->bv_offset, this_count);
> - }
> -}
> -
> -static int _io_check(struct objio_state *ios, bool is_write)
> -{
> - enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
> - int lin_ret = 0;
> - int i;
> -
> - for (i = 0; i < ios->numdevs; i++) {
> - struct osd_sense_info osi;
> - struct osd_request *or = ios->per_dev[i].or;
> - int ret;
> -
> - if (!or)
> - continue;
> -
> - ret = osd_req_decode_sense(or, &osi);
> - if (likely(!ret))
> - continue;
> -
> - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
> - /* start read offset passed endof file */
> - BUG_ON(is_write);
> - _clear_bio(ios->per_dev[i].bio);
> - dprintk("%s: start read offset passed end of file "
> - "offset=0x%llx, length=0x%lx\n", __func__,
> - _LLU(ios->per_dev[i].offset),
> - ios->per_dev[i].length);
> -
> - continue; /* we recovered */
> - }
> - __on_dev_error(ios, is_write, ios->oc->ods[i],
> - ios->per_dev[i].dev, osi.osd_err_pri,
> - ios->per_dev[i].offset, ios->per_dev[i].length);
> -
> - if (osi.osd_err_pri >= oep) {
> - oep = osi.osd_err_pri;
> - lin_ret = ret;
> - }
> - }
> -
> - return lin_ret;
> -}
> -
> -/*
> - * Common IO state helpers.
> - */
> -static void _io_free(struct objio_state *ios)
> -{
> - unsigned i;
> -
> - for (i = 0; i < ios->numdevs; i++) {
> - struct _objio_per_comp *per_dev = &ios->per_dev[i];
> -
> - if (per_dev->or) {
> - osd_end_request(per_dev->or);
> - per_dev->or = NULL;
> - }
> -
> - if (per_dev->bio) {
> - bio_put(per_dev->bio);
> - per_dev->bio = NULL;
> - }
> - }
> -}
> -
> -struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
> -{
> - unsigned min_dev = ios->oc->first_dev;
> - unsigned max_dev = min_dev + ios->oc->numdevs;
> -
> - BUG_ON(dev < min_dev || max_dev <= dev);
> - return ios->oc->ods[dev - min_dev]->od;
> -}
> -
> -struct _striping_info {
> - u64 obj_offset;
> - u64 group_length;
> - unsigned dev;
> - unsigned unit_off;
> -};
> -
> -static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
> - struct _striping_info *si)
> -{
> - u32 stripe_unit = ios->layout->stripe_unit;
> - u32 group_width = ios->layout->group_width;
> - u64 group_depth = ios->layout->group_depth;
> - u32 U = stripe_unit * group_width;
> -
> - u64 T = U * group_depth;
> - u64 S = T * ios->layout->group_count;
> - u64 M = div64_u64(file_offset, S);
> -
> - /*
> - G = (L - (M * S)) / T
> - H = (L - (M * S)) % T
> - */
> - u64 LmodU = file_offset - M * S;
> - u32 G = div64_u64(LmodU, T);
> - u64 H = LmodU - G * T;
> -
> - u32 N = div_u64(H, U);
> -
> - div_u64_rem(file_offset, stripe_unit, &si->unit_off);
> - si->obj_offset = si->unit_off + (N * stripe_unit) +
> - (M * group_depth * stripe_unit);
> -
> - /* "H - (N * U)" is just "H % U" so it's bound to u32 */
> - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
> - si->dev *= ios->layout->mirrors_p1;
> -
> - si->group_length = T - H;
> -}
> -
> -static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
> - unsigned pgbase, struct _objio_per_comp *per_dev, int len,
> - gfp_t gfp_flags)
> -{
> - unsigned pg = *cur_pg;
> - int cur_len = len;
> - struct request_queue *q =
> - osd_request_queue(_io_od(ios, per_dev->dev));
> -
> - if (per_dev->bio == NULL) {
> - unsigned pages_in_stripe = ios->layout->group_width *
> - (ios->layout->stripe_unit / PAGE_SIZE);
> - unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
> - ios->layout->group_width;
> -
> - if (BIO_MAX_PAGES_KMALLOC < bio_size)
> - bio_size = BIO_MAX_PAGES_KMALLOC;
> -
> - per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
> - if (unlikely(!per_dev->bio)) {
> - dprintk("Faild to allocate BIO size=%u\n", bio_size);
> - return -ENOMEM;
> - }
> - }
> -
> - while (cur_len > 0) {
> - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
> - unsigned added_len;
> -
> - BUG_ON(ios->nr_pages <= pg);
> - cur_len -= pglen;
> -
> - added_len = bio_add_pc_page(q, per_dev->bio,
> - ios->pages[pg], pglen, pgbase);
> - if (unlikely(pglen != added_len))
> - return -ENOMEM;
> - pgbase = 0;
> - ++pg;
> - }
> - BUG_ON(cur_len);
> -
> - per_dev->length += len;
> - *cur_pg = pg;
> - return 0;
> -}
> -
> -static int _prepare_one_group(struct objio_state *ios, u64 length,
> - struct _striping_info *si, unsigned *last_pg,
> - gfp_t gfp_flags)
> -{
> - unsigned stripe_unit = ios->layout->stripe_unit;
> - unsigned mirrors_p1 = ios->layout->mirrors_p1;
> - unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
> - unsigned dev = si->dev;
> - unsigned first_dev = dev - (dev % devs_in_group);
> - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
> - unsigned cur_pg = *last_pg;
> - int ret = 0;
> -
> - while (length) {
> - struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
> - unsigned cur_len, page_off = 0;
> -
> - if (!per_dev->length) {
> - per_dev->dev = dev;
> - if (dev < si->dev) {
> - per_dev->offset = si->obj_offset + stripe_unit -
> - si->unit_off;
> - cur_len = stripe_unit;
> - } else if (dev == si->dev) {
> - per_dev->offset = si->obj_offset;
> - cur_len = stripe_unit - si->unit_off;
> - page_off = si->unit_off & ~PAGE_MASK;
> - BUG_ON(page_off &&
> - (page_off != ios->pgbase));
> - } else { /* dev > si->dev */
> - per_dev->offset = si->obj_offset - si->unit_off;
> - cur_len = stripe_unit;
> - }
> -
> - if (max_comp < dev - first_dev)
> - max_comp = dev - first_dev;
> - } else {
> - cur_len = stripe_unit;
> - }
> - if (cur_len >= length)
> - cur_len = length;
> -
> - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
> - cur_len, gfp_flags);
> - if (unlikely(ret))
> - goto out;
> -
> - dev += mirrors_p1;
> - dev = (dev % devs_in_group) + first_dev;
> -
> - length -= cur_len;
> - ios->length += cur_len;
> - }
> -out:
> - ios->numdevs = max_comp + mirrors_p1;
> - *last_pg = cur_pg;
> - return ret;
> -}
> -
> -static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
> -{
> - u64 length = ios->count;
> - u64 offset = ios->offset;
> - struct _striping_info si;
> - unsigned last_pg = 0;
> - int ret = 0;
> -
> - while (length) {
> - _calc_stripe_info(ios, offset, &si);
> -
> - if (length < si.group_length)
> - si.group_length = length;
> -
> - ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
> - if (unlikely(ret))
> - goto out;
> -
> - offset += si.group_length;
> - length -= si.group_length;
> - }
> -
> -out:
> - if (!ios->length)
> - return ret;
> -
> - return 0;
> -}
> -
> -static int _sync_done(struct objio_state *ios)
> -{
> - struct completion *waiting = ios->private;
> -
> - complete(waiting);
> - return 0;
> -}
> -
> -static void _last_io(struct kref *kref)
> -{
> - struct objio_state *ios = container_of(kref, struct objio_state, kref);
> -
> - ios->done(ios);
> -}
> -
> -static void _done_io(struct osd_request *or, void *p)
> -{
> - struct objio_state *ios = p;
> -
> - kref_put(&ios->kref, _last_io);
> -}
> -
> -static int _io_exec(struct objio_state *ios)
> -{
> - DECLARE_COMPLETION_ONSTACK(wait);
> - int ret = 0;
> - unsigned i;
> - objio_done_fn saved_done_fn = ios->done;
> - bool sync = ios->sync;
> -
> - if (sync) {
> - ios->done = _sync_done;
> - ios->private = &wait;
> - }
> -
> - kref_init(&ios->kref);
> -
> - for (i = 0; i < ios->numdevs; i++) {
> - struct osd_request *or = ios->per_dev[i].or;
> -
> - if (!or)
> - continue;
> -
> - kref_get(&ios->kref);
> - osd_execute_request_async(or, _done_io, ios);
> - }
> -
> - kref_put(&ios->kref, _last_io);
> -
> - if (sync) {
> - wait_for_completion(&wait);
> - ret = saved_done_fn(ios);
> - }
> -
> - return ret;
> -}
> -#endif
> -
> /*
> * read
> */
> @@ -781,63 +418,6 @@ static void _read_done(struct ore_io_state *ios, void *private)
> objlayout_read_done(&objios->oir, status, objios->sync);
> }
>
> -#if 0
> -static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
> -{
> - struct osd_request *or = NULL;
> - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
> - unsigned dev = per_dev->dev;
> - struct ore_comp *cred =
> - &ios->oc->comps[cur_comp];
> - struct osd_obj_id obj = cred->obj;
> - int ret;
> -
> - or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
> - if (unlikely(!or)) {
> - ret = -ENOMEM;
> - goto err;
> - }
> - per_dev->or = or;
> -
> - osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
> -
> - ret = osd_finalize_request(or, 0, cred->cred, NULL);
> - if (ret) {
> - dprintk("%s: Faild to osd_finalize_request() => %d\n",
> - __func__, ret);
> - goto err;
> - }
> -
> - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
> - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
> - per_dev->length);
> -
> -err:
> - return ret;
> -}
> -
> -static int _read_exec(struct objio_state *ios)
> -{
> - unsigned i;
> - int ret;
> -
> - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
> - if (!ios->per_dev[i].length)
> - continue;
> - ret = _read_mirrors(ios, i);
> - if (unlikely(ret))
> - goto err;
> - }
> -
> - ios->done = _read_done;
> - return _io_exec(ios);
> -
> -err:
> - _io_free(ios);
> - return ret;
> -}
> -#endif
> -
> int objio_read_pagelist(struct nfs_read_data *rdata)
> {
> struct objio_state *objios;
> @@ -879,90 +459,6 @@ static void _write_done(struct ore_io_state *ios, void *private)
> objlayout_write_done(&objios->oir, status, objios->sync);
> }
>
> -#if 0
> -static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
> -{
> - struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
> - unsigned dev = ios->per_dev[cur_comp].dev;
> - unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
> - int ret;
> -
> - for (; cur_comp < last_comp; ++cur_comp, ++dev) {
> - struct osd_request *or = NULL;
> - struct ore_comp *cred = &ios->oc->comps[cur_comp];
> - struct osd_obj_id obj = cred->obj;
> - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
> - struct bio *bio;
> -
> - or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
> - if (unlikely(!or)) {
> - ret = -ENOMEM;
> - goto err;
> - }
> - per_dev->or = or;
> -
> - if (per_dev != master_dev) {
> - bio = bio_kmalloc(GFP_NOFS,
> - master_dev->bio->bi_max_vecs);
> - if (unlikely(!bio)) {
> - dprintk("Faild to allocate BIO size=%u\n",
> - master_dev->bio->bi_max_vecs);
> - ret = -ENOMEM;
> - goto err;
> - }
> -
> - __bio_clone(bio, master_dev->bio);
> - bio->bi_bdev = NULL;
> - bio->bi_next = NULL;
> - per_dev->bio = bio;
> - per_dev->dev = dev;
> - per_dev->length = master_dev->length;
> - per_dev->offset = master_dev->offset;
> - } else {
> - bio = master_dev->bio;
> - bio->bi_rw |= REQ_WRITE;
> - }
> -
> - osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
> -
> - ret = osd_finalize_request(or, 0, cred->cred, NULL);
> - if (ret) {
> - dprintk("%s: Faild to osd_finalize_request() => %d\n",
> - __func__, ret);
> - goto err;
> - }
> -
> - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
> - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
> - per_dev->length);
> - }
> -
> -err:
> - return ret;
> -}
> -
> -static int _write_exec(struct objio_state *ios)
> -{
> - unsigned i;
> - int ret;
> -
> - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
> - if (!ios->per_dev[i].length)
> - continue;
> - ret = _write_mirrors(ios, i);
> - if (unlikely(ret))
> - goto err;
> - }
> -
> - ios->done = _write_done;
> - return _io_exec(ios);
> -
> -err:
> - _io_free(ios);
> - return ret;
> -}
> -#endif
> -
> int objio_write_pagelist(struct nfs_write_data *wdata, int how)
> {
> struct objio_state *objios;

2011-10-11 02:34:38

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 18/19] pnfs-obj: move to ore 02: move to ORE

On 2011-10-04 06:36, Boaz Harrosh wrote:
> In this patch we are actually moving to the ORE.
> (Object Raid Engine).
>
> objio_state holds a pointer to an ore_io_state. Once
> we have an ore_io_state at hand we can call the ore
> for reading/writing. We register on the done path
> to kick off the nfs io_done mechanism.
>
> Again, for ease of reviewing, the old code is put under "#if 0"
> but is not removed, so that the diff command works better.
> The old code will be removed in the next patch.
>
> Signed-off-by: Boaz Harrosh <[email protected]>

Acked-by: Benny Halevy <[email protected]>

Thanks!

> ---
> fs/nfs/objlayout/objio_osd.c | 133 +++++++++++++++++++-----------------------
> 1 files changed, 59 insertions(+), 74 deletions(-)
>
> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
> index bd7ec26..00b3849 100644
> --- a/fs/nfs/objlayout/objio_osd.c
> +++ b/fs/nfs/objlayout/objio_osd.c
> @@ -44,12 +44,6 @@
>
> #define NFSDBG_FACILITY NFSDBG_PNFS_LD
>
> -#define _LLU(x) ((unsigned long long)x)
> -
> -enum { BIO_MAX_PAGES_KMALLOC =
> - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
> -};
> -
> struct objio_dev_ent {
> struct nfs4_deviceid_node id_node;
> struct ore_dev od;
> @@ -124,37 +118,13 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
> return container_of(lseg, struct objio_segment, lseg);
> }
>
> -struct objio_state;
> -typedef int (*objio_done_fn)(struct objio_state *ios);
> -
> struct objio_state {
> /* Generic layer */
> struct objlayout_io_res oir;
>
> - struct page **pages;
> - unsigned pgbase;
> - unsigned nr_pages;
> - unsigned long count;
> - loff_t offset;
> bool sync;
> -
> - struct ore_layout *layout;
> - struct ore_components *oc;
> -
> - struct kref kref;
> - objio_done_fn done;
> - void *private;
> -
> - unsigned long length;
> - unsigned numdevs; /* Actually used devs in this IO */
> - /* A per-device variable array of size numdevs */
> - struct _objio_per_comp {
> - struct bio *bio;
> - struct osd_request *or;
> - unsigned long length;
> - u64 offset;
> - unsigned dev;
> - } per_dev[];
> + /*FIXME: Support for extra_bytes at ore_get_rw_state() */
> + struct ore_io_state *ios;
> };
>
> /* Send and wait for a get_device_info of devices in the layout,
> @@ -374,16 +344,16 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
> }
>
> static int
> -objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
> +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
> struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
> loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
> struct objio_state **outp)
> {
> struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
> - struct objio_state *ios;
> + struct ore_io_state *ios;
> + int ret;
> struct __alloc_objio_state {
> struct objio_state objios;
> - struct _objio_per_comp per_dev[objio_seg->oc.numdevs];
> struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
> } *aos;
>
> @@ -391,30 +361,33 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
> if (unlikely(!aos))
> return -ENOMEM;
>
> - ios = &aos->objios;
> -
> - ios->layout = &objio_seg->layout;
> - ios->oc = &objio_seg->oc;
> objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
> aos->ioerrs, rpcdata, pnfs_layout_type);
>
> + ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
> + offset, count, &ios);
> + if (unlikely(ret)) {
> + kfree(aos);
> + return ret;
> + }
> +
> ios->pages = pages;
> ios->pgbase = pgbase;
> - ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
> - ios->offset = offset;
> - ios->count = count;
> - ios->sync = 0;
> + ios->private = aos;
> BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
>
> - *outp = ios;
> + aos->objios.sync = 0;
> + aos->objios.ios = ios;
> + *outp = &aos->objios;
> return 0;
> }
>
> void objio_free_result(struct objlayout_io_res *oir)
> {
> - struct objio_state *ios = container_of(oir, struct objio_state, oir);
> + struct objio_state *objios = container_of(oir, struct objio_state, oir);
>
> - kfree(ios);
> + ore_put_io_state(objios->ios);
> + kfree(objios);
> }
>
> enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
> @@ -447,7 +420,7 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
> }
> }
>
> -static void __on_dev_error(struct objio_state *ios, bool is_write,
> +static void __on_dev_error(struct ore_io_state *ios,
> struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
> u64 dev_offset, u64 dev_len)
> {
> @@ -465,9 +438,10 @@ static void __on_dev_error(struct objio_state *ios, bool is_write,
>
> objlayout_io_set_result(&objios->oir, comp,
> &pooid, osd_pri_2_pnfs_err(oep),
> - dev_offset, dev_len, is_write);
> + dev_offset, dev_len, !ios->reading);
> }
>
> +#if 0
> static void _clear_bio(struct bio *bio)
> {
> struct bio_vec *bv;
> @@ -786,26 +760,28 @@ static int _io_exec(struct objio_state *ios)
>
> return ret;
> }
> +#endif
>
> /*
> * read
> */
> -static int _read_done(struct objio_state *ios)
> +static void _read_done(struct ore_io_state *ios, void *private)
> {
> + struct objio_state *objios = private;
> ssize_t status;
> - int ret = _io_check(ios, false);
> + int ret = ore_check_io(ios, &__on_dev_error);
>
> - _io_free(ios);
> + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
>
> if (likely(!ret))
> status = ios->length;
> else
> status = ret;
>
> - objlayout_read_done(&ios->oir, status, ios->sync);
> - return ret;
> + objlayout_read_done(&objios->oir, status, objios->sync);
> }
>
> +#if 0
> static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
> {
> struct osd_request *or = NULL;
> @@ -860,49 +836,50 @@ err:
> _io_free(ios);
> return ret;
> }
> +#endif
>
> int objio_read_pagelist(struct nfs_read_data *rdata)
> {
> - struct objio_state *ios;
> + struct objio_state *objios;
> int ret;
>
> - ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout,
> + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
> rdata->lseg, rdata->args.pages, rdata->args.pgbase,
> rdata->args.offset, rdata->args.count, rdata,
> - GFP_KERNEL, &ios);
> - if (unlikely(ret))
> - return ret;
> -
> - ret = _io_rw_pagelist(ios, GFP_KERNEL);
> + GFP_KERNEL, &objios);
> if (unlikely(ret))
> return ret;
>
> - return _read_exec(ios);
> + objios->ios->done = _read_done;
> + dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
> + rdata->args.offset, rdata->args.count);
> + return ore_read(objios->ios);
> }
>
> /*
> * write
> */
> -static int _write_done(struct objio_state *ios)
> +static void _write_done(struct ore_io_state *ios, void *private)
> {
> + struct objio_state *objios = private;
> ssize_t status;
> - int ret = _io_check(ios, true);
> + int ret = ore_check_io(ios, &__on_dev_error);
>
> - _io_free(ios);
> + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
>
> if (likely(!ret)) {
> /* FIXME: should be based on the OSD's persistence model
> * See OSD2r05 Section 4.13 Data persistence model */
> - ios->oir.committed = NFS_FILE_SYNC;
> + objios->oir.committed = NFS_FILE_SYNC;
> status = ios->length;
> } else {
> status = ret;
> }
>
> - objlayout_write_done(&ios->oir, status, ios->sync);
> - return ret;
> + objlayout_write_done(&objios->oir, status, objios->sync);
> }
>
> +#if 0
> static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
> {
> struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
> @@ -984,27 +961,35 @@ err:
> _io_free(ios);
> return ret;
> }
> +#endif
>
> int objio_write_pagelist(struct nfs_write_data *wdata, int how)
> {
> - struct objio_state *ios;
> + struct objio_state *objios;
> int ret;
>
> - ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout,
> + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
> wdata->lseg, wdata->args.pages, wdata->args.pgbase,
> wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
> - &ios);
> + &objios);
> if (unlikely(ret))
> return ret;
>
> - ios->sync = 0 != (how & FLUSH_SYNC);
> + objios->sync = 0 != (how & FLUSH_SYNC);
>
> - /* TODO: ios->stable = stable; */
> - ret = _io_rw_pagelist(ios, GFP_NOFS);
> + if (!objios->sync)
> + objios->ios->done = _write_done;
> +
> + dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
> + wdata->args.offset, wdata->args.count);
> + ret = ore_write(objios->ios);
> if (unlikely(ret))
> return ret;
>
> - return _write_exec(ios);
> + if (objios->sync)
> + _write_done(objios->ios, objios);
> +
> + return 0;
> }
>
> static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,

2011-10-11 02:32:10

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 14/19] pnfs-obj: Return PNFS_NOT_ATTEMPTED in case of read/write_pagelist

On 2011-10-04 06:35, Boaz Harrosh wrote:
> The objlayout driver was always returning PNFS_ATTEMPTED from its
> read/write_pagelist operations, even on error. Fix that.
>
> Start by establishing an error return API from io-engine, by
> not returning ssize_t (length-or-error) but returning "int"
> (0 = OK, negative = error). And clean up all return types in io-engine.
>
> Then, if the io-engine returned an error, return PNFS_NOT_ATTEMPTED
> to the generic layer (with a dprintk).
>
> Signed-off-by: Boaz Harrosh <[email protected]>

Looks good to me!

Reviewed-by: Benny Halevy <[email protected]>

> ---
> fs/nfs/objlayout/objio_osd.c | 32 ++++++++++++++++----------------
> fs/nfs/objlayout/objlayout.c | 36 +++++++++++++++++++-----------------
> fs/nfs/objlayout/objlayout.h | 4 ++--
> 3 files changed, 37 insertions(+), 35 deletions(-)
>
> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
> index d0cda12..0c7c9ec 100644
> --- a/fs/nfs/objlayout/objio_osd.c
> +++ b/fs/nfs/objlayout/objio_osd.c
> @@ -142,7 +142,7 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
> }
>
> struct objio_state;
> -typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
> +typedef int (*objio_done_fn)(struct objio_state *ios);
>
> struct objio_state {
> /* Generic layer */
> @@ -720,7 +720,7 @@ out:
> return 0;
> }
>
> -static ssize_t _sync_done(struct objio_state *ios)
> +static int _sync_done(struct objio_state *ios)
> {
> struct completion *waiting = ios->private;
>
> @@ -742,10 +742,10 @@ static void _done_io(struct osd_request *or, void *p)
> kref_put(&ios->kref, _last_io);
> }
>
> -static ssize_t _io_exec(struct objio_state *ios)
> +static int _io_exec(struct objio_state *ios)
> {
> DECLARE_COMPLETION_ONSTACK(wait);
> - ssize_t status = 0; /* sync status */
> + int ret = 0;
> unsigned i;
> objio_done_fn saved_done_fn = ios->done;
> bool sync = ios->ol_state.sync;
> @@ -771,16 +771,16 @@ static ssize_t _io_exec(struct objio_state *ios)
>
> if (sync) {
> wait_for_completion(&wait);
> - status = saved_done_fn(ios);
> + ret = saved_done_fn(ios);
> }
>
> - return status;
> + return ret;
> }
>
> /*
> * read
> */
> -static ssize_t _read_done(struct objio_state *ios)
> +static int _read_done(struct objio_state *ios)
> {
> ssize_t status;
> int ret = _io_check(ios, false);
> @@ -793,7 +793,7 @@ static ssize_t _read_done(struct objio_state *ios)
> status = ret;
>
> objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
> - return status;
> + return ret;
> }
>
> static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
> @@ -833,7 +833,7 @@ err:
> return ret;
> }
>
> -static ssize_t _read_exec(struct objio_state *ios)
> +static int _read_exec(struct objio_state *ios)
> {
> unsigned i;
> int ret;
> @@ -847,14 +847,14 @@ static ssize_t _read_exec(struct objio_state *ios)
> }
>
> ios->done = _read_done;
> - return _io_exec(ios); /* In sync mode exec returns the io status */
> + return _io_exec(ios);
>
> err:
> _io_free(ios);
> return ret;
> }
>
> -ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
> +int objio_read_pagelist(struct objlayout_io_state *ol_state)
> {
> struct objio_state *ios = container_of(ol_state, struct objio_state,
> ol_state);
> @@ -870,7 +870,7 @@ ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
> /*
> * write
> */
> -static ssize_t _write_done(struct objio_state *ios)
> +static int _write_done(struct objio_state *ios)
> {
> ssize_t status;
> int ret = _io_check(ios, true);
> @@ -887,7 +887,7 @@ static ssize_t _write_done(struct objio_state *ios)
> }
>
> objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
> - return status;
> + return ret;
> }
>
> static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
> @@ -955,7 +955,7 @@ err:
> return ret;
> }
>
> -static ssize_t _write_exec(struct objio_state *ios)
> +static int _write_exec(struct objio_state *ios)
> {
> unsigned i;
> int ret;
> @@ -969,14 +969,14 @@ static ssize_t _write_exec(struct objio_state *ios)
> }
>
> ios->done = _write_done;
> - return _io_exec(ios); /* In sync mode exec returns the io->status */
> + return _io_exec(ios);
>
> err:
> _io_free(ios);
> return ret;
> }
>
> -ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
> +int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
> {
> struct objio_state *ios = container_of(ol_state, struct objio_state,
> ol_state);
> diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
> index 1300736..99c807d 100644
> --- a/fs/nfs/objlayout/objlayout.c
> +++ b/fs/nfs/objlayout/objlayout.c
> @@ -315,16 +315,13 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
> loff_t offset = rdata->args.offset;
> size_t count = rdata->args.count;
> struct objlayout_io_state *state;
> - ssize_t status = 0;
> + int err;
> loff_t eof;
>
> - dprintk("%s: Begin inode %p offset %llu count %d\n",
> - __func__, rdata->inode, offset, (int)count);
> -
> eof = i_size_read(rdata->inode);
> if (unlikely(offset + count > eof)) {
> if (offset >= eof) {
> - status = 0;
> + err = 0;
> rdata->res.count = 0;
> rdata->res.eof = 1;
> /*FIXME: do we need to call pnfs_ld_read_done() */
> @@ -341,14 +338,19 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
> rdata->lseg, rdata,
> GFP_KERNEL);
> if (unlikely(!state)) {
> - status = -ENOMEM;
> + err = -ENOMEM;
> goto out;
> }
> + dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
> + __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
>
> - status = objio_read_pagelist(state);
> + err = objio_read_pagelist(state);
> out:
> - dprintk("%s: Return status %Zd\n", __func__, status);
> - rdata->pnfs_error = status;
> + if (unlikely(err)) {
> + rdata->pnfs_error = err;
> + dprintk("%s: Returned Error %d\n", __func__, err);
> + return PNFS_NOT_ATTEMPTED;
> + }
> return PNFS_ATTEMPTED;
> }
>
> @@ -406,10 +408,7 @@ objlayout_write_pagelist(struct nfs_write_data *wdata,
> int how)
> {
> struct objlayout_io_state *state;
> - ssize_t status;
> -
> - dprintk("%s: Begin inode %p offset %llu count %u\n",
> - __func__, wdata->inode, wdata->args.offset, wdata->args.count);
> + int err;
>
> state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
> wdata->args.pages,
> @@ -419,16 +418,19 @@ objlayout_write_pagelist(struct nfs_write_data *wdata,
> wdata->lseg, wdata,
> GFP_NOFS);
> if (unlikely(!state)) {
> - status = -ENOMEM;
> + err = -ENOMEM;
> goto out;
> }
>
> state->sync = how & FLUSH_SYNC;
>
> - status = objio_write_pagelist(state, how & FLUSH_STABLE);
> + err = objio_write_pagelist(state, how & FLUSH_STABLE);
> out:
> - dprintk("%s: Return status %Zd\n", __func__, status);
> - wdata->pnfs_error = status;
> + if (unlikely(err)) {
> + wdata->pnfs_error = err;
> + dprintk("%s: Returned Error %d\n", __func__, err);
> + return PNFS_NOT_ATTEMPTED;
> + }
> return PNFS_ATTEMPTED;
> }
>
> diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
> index ffb884c..4edac9b 100644
> --- a/fs/nfs/objlayout/objlayout.h
> +++ b/fs/nfs/objlayout/objlayout.h
> @@ -115,8 +115,8 @@ extern int objio_alloc_io_state(
> gfp_t gfp_flags);
> extern void objio_free_io_state(struct objlayout_io_state *state);
>
> -extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
> -extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
> +extern int objio_read_pagelist(struct objlayout_io_state *ol_state);
> +extern int objio_write_pagelist(struct objlayout_io_state *ol_state,
> bool stable);
>
> /*

2011-10-11 02:31:12

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 13/19] pnfs-obj: Remove redundant EOF from objlayout_io_state

On 2011-10-04 06:34, Boaz Harrosh wrote:
> The EOF calculation was done on .read_pagelist(), cached
> in objlayout_io_state->eof, and set in objlayout_read_done()
> into nfs_read_data->res.eof.
>
> So set it directly into nfs_read_data->res.eof and avoid
> the extra member.
>
> This is a slight behaviour change because before eof was
> *not* set on an error update at objlayout_read_done(). But
> is that a problem? Is Generic layer so sensitive that it
> will miss the error IO if eof was set? From my testing
> I did not see such a problem.
>
> Benny please review.
>
> Which brings me to a more abstract problem. Why does the
> LAYOUT driver needs to do this eof calculation? .i.e we
> are inspecting generic i_size_read() and if spanned by
> offset + count which is received from generic layer we set
> eof. It looks like all this can/should be done in generic
> layer and not at LD. Where does NFS and files-LD do it?
> It looks like it can be promoted.

In the files layout case, nfs_read_done sets res.eof.
But I agree this code could be moved to the generic layout
at least to serve non-rpc LDs.

And BTW, currently the object layout's handling of the eof flag
is stricter than the blocks layout's: it requires an extra
call with offset >= i_size to set the eof flag, while for
nfs and blocks eof is set when offset + count >= i_size.

>
> Signed-off-by: Boaz Harrosh <[email protected]>

Reviewed-by: Benny Halevy <[email protected]>

> ---
> fs/nfs/objlayout/objlayout.c | 16 +++++++---------
> fs/nfs/objlayout/objlayout.h | 1 -
> 2 files changed, 7 insertions(+), 10 deletions(-)
>
> diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
> index 1d06f8e..1300736 100644
> --- a/fs/nfs/objlayout/objlayout.c
> +++ b/fs/nfs/objlayout/objlayout.c
> @@ -287,17 +287,14 @@ static void _rpc_read_complete(struct work_struct *work)
> void
> objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
> {
> - int eof = state->eof;
> - struct nfs_read_data *rdata;
> + struct nfs_read_data *rdata = state->rpcdata;
>
> state->status = status;
> - dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof);
> - rdata = state->rpcdata;
> + dprintk("%s: Begin status=%zd eof=%d\n", __func__,
> + status, rdata->res.eof);
> rdata->task.tk_status = status;
> - if (status >= 0) {
> + if (status >= 0)
> rdata->res.count = status;
> - rdata->res.eof = eof;
> - }
> objlayout_iodone(state);
> /* must not use state after this point */
>
> @@ -330,11 +327,14 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
> status = 0;
> rdata->res.count = 0;
> rdata->res.eof = 1;
> + /*FIXME: do we need to call pnfs_ld_read_done() */

Yes, it looks like we do, otherwise we might leak a refcount on the lseg.
We also need to set rdata->task.tk_status = 0, to mimic what objlayout_read_done
would have done in the sync case.

Benny

> goto out;
> }
> count = eof - offset;
> }
>
> + rdata->res.eof = (offset + count) >= eof;
> +
> state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
> rdata->args.pages, rdata->args.pgbase,
> offset, count,
> @@ -345,8 +345,6 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
> goto out;
> }
>
> - state->eof = state->offset + state->count >= eof;
> -
> status = objio_read_pagelist(state);
> out:
> dprintk("%s: Return status %Zd\n", __func__, status);
> diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
> index a8244c8..ffb884c 100644
> --- a/fs/nfs/objlayout/objlayout.h
> +++ b/fs/nfs/objlayout/objlayout.h
> @@ -86,7 +86,6 @@ struct objlayout_io_state {
>
> void *rpcdata;
> int status; /* res */
> - int eof; /* res */
> int committed; /* res */
>
> /* Error reporting (layout_return) */

2011-10-04 10:32:50

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 09/19] ore: Support for short read/writes

Memory conditions and max_bio constraints might prevent us from
satisfying the full length of the requested IO. Instead of
failing the complete IO we can issue a shorter read/write and
report how much was actually executed in the ios->length
member.

All users must check ios->length at IO_done or upon return of
ore_read/write and re-issue the remainder of the bytes, because
otherwise, unlike before, no error is returned for a short IO.

This is part of the effort to support the pnfs-obj layout driver.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/ore.c | 30 +++++++++++++++++++++++-------
1 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 2f39f23..0b992e1 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -377,8 +377,8 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
unsigned pg = *cur_pg;
struct request_queue *q =
osd_request_queue(_ios_od(ios, per_dev->dev));
-
- per_dev->length += cur_len;
+ unsigned len = cur_len;
+ int ret;

if (per_dev->bio == NULL) {
unsigned pages_in_stripe = ios->layout->group_width *
@@ -390,7 +390,8 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
if (unlikely(!per_dev->bio)) {
ORE_DBGMSG("Failed to allocate BIO size=%u\n",
bio_size);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
}

@@ -403,15 +404,24 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,

added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
pglen, pgbase);
- if (unlikely(pglen != added_len))
- return -ENOMEM;
+ if (unlikely(pglen != added_len)) {
+ ret = -ENOMEM;
+ goto out;
+ }
pgbase = 0;
++pg;
}
BUG_ON(cur_len);

+ per_dev->length += len;
*cur_pg = pg;
- return 0;
+ ret = 0;
+out: /* we fail the complete unit on an error eg don't advance
+ * per_dev->length and cur_pg. This means that we might have a bigger
+ * bio than the CDB requested length (per_dev->length). That's fine
+ * only the oposite is fatal.
+ */
+ return ret;
}

static int _prepare_for_striping(struct ore_io_state *ios)
@@ -476,7 +486,13 @@ static int _prepare_for_striping(struct ore_io_state *ios)
out:
ios->numdevs = max_comp + mirrors_p1;
ios->pages_consumed = cur_pg;
- return ret;
+ if (unlikely(ret)) {
+ if (length == ios->length)
+ return ret;
+ else
+ ios->length -= length;
+ }
+ return 0;
}

int ore_create(struct ore_io_state *ios)
--
1.7.2.3


2011-10-04 10:34:23

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 11/19] ore/exofs: Define new ore_verify_layout

All users of the ore will need to check if current code
supports the given layout. For example RAID5/6 is not
currently supported.

So move all the checks from exofs/super.c to a new
ore_verify_layout() to be used by ore users.

Note that any new layout must be passed through
ore_verify_layout(), because it prepares and verifies some
internal members of ore_layout, and the ore engine assumes
that it has been called.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/inode.c | 9 +-----
fs/exofs/ore.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
fs/exofs/super.c | 49 ++---------------------------------
include/scsi/osd_ore.h | 8 +++++
4 files changed, 80 insertions(+), 53 deletions(-)

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 0afe761..7b1a4ba 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,11 +37,7 @@

#define EXOFS_DBGMSG2(M...) do {} while (0)

-enum { BIO_MAX_PAGES_KMALLOC =
- (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
- MAX_PAGES_KMALLOC =
- PAGE_SIZE / sizeof(struct page *),
-};
+enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };

unsigned exofs_max_io_pages(struct ore_layout *layout,
unsigned expected_pages)
@@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout,
unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);

/* TODO: easily support bio chaining */
- pages = min_t(unsigned, pages,
- layout->group_width * BIO_MAX_PAGES_KMALLOC);
+ pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
return pages;
}

diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 7913168..14a9100 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -47,9 +47,76 @@ MODULE_AUTHOR("Boaz Harrosh <[email protected]>");
MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
MODULE_LICENSE("GPL");

+/* ore_verify_layout does a couple of things:
+ * 1. Given a minimum number of needed parameters fixes up the rest of the
+ * members to be operatonals for the ore. The needed parameters are those
+ * that are defined by the pnfs-objects layout STD.
+ * 2. Check to see if the current ore code actually supports these parameters
+ * for example stripe_unit must be a multple of the system PAGE_SIZE,
+ * and etc...
+ * 3. Cache some havily used calculations that will be needed by users.
+ */
+
static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
struct ore_striping_info *si);

+enum { BIO_MAX_PAGES_KMALLOC =
+ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
+
+int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
+{
+ u64 stripe_length;
+
+/* FIXME: Only raid0 is supported for now. */
+ if (layout->raid_algorithm != PNFS_OSD_RAID_0) {
+ ORE_ERR("Only RAID_0 for now\n");
+ return -EINVAL;
+ }
+ if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
+ ORE_ERR("Stripe Unit(0x%llx)"
+ " must be Multples of PAGE_SIZE(0x%lx)\n",
+ _LLU(layout->stripe_unit), PAGE_SIZE);
+ return -EINVAL;
+ }
+ if (layout->group_width) {
+ if (!layout->group_depth) {
+ ORE_ERR("group_depth == 0 && group_width != 0\n");
+ return -EINVAL;
+ }
+ if (total_comps < (layout->group_width * layout->mirrors_p1)) {
+ ORE_ERR("Data Map wrong, "
+ "numdevs=%d < group_width=%d * mirrors=%d\n",
+ total_comps, layout->group_width,
+ layout->mirrors_p1);
+ return -EINVAL;
+ }
+ layout->group_count = total_comps / layout->mirrors_p1 /
+ layout->group_width;
+ } else {
+ if (layout->group_depth) {
+ printk(KERN_NOTICE "Warning: group_depth ignored "
+ "group_width == 0 && group_depth == %lld\n",
+ _LLU(layout->group_depth));
+ }
+ layout->group_width = total_comps / layout->mirrors_p1;
+ layout->group_depth = -1;
+ layout->group_count = 1;
+ }
+
+ stripe_length = (u64)layout->group_width * layout->stripe_unit;
+ if (stripe_length >= (1ULL << 32)) {
+ ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
+ _LLU(stripe_length));
+ return -EINVAL;
+ }
+
+ layout->max_io_length =
+ (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
+ layout->group_width;
+ return 0;
+}
+EXPORT_SYMBOL(ore_verify_layout);
+
static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
{
return ios->oc->comps[index & ios->oc->single_comp].cred;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index bce3686..057b237 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -480,7 +480,7 @@ static void exofs_put_super(struct super_block *sb)
static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
struct exofs_device_table *dt)
{
- u64 stripe_length;
+ int ret;

sbi->layout.stripe_unit =
le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
@@ -493,50 +493,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
sbi->layout.raid_algorithm =
le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);

-/* FIXME: Only raid0 for now. if not so, do not mount */
- if (sbi->layout.raid_algorithm != PNFS_OSD_RAID_0) {
- EXOFS_ERR("Only RAID_0 for now\n");
- return -EINVAL;
- }
- if (numdevs < (sbi->layout.group_width * sbi->layout.mirrors_p1)) {
- EXOFS_ERR("Data Map wrong, "
- "numdevs=%d < group_width=%d * mirrors=%d\n",
- numdevs, sbi->layout.group_width,
- sbi->layout.mirrors_p1);
- return -EINVAL;
- }
-
- if (0 != (sbi->layout.stripe_unit & ~PAGE_MASK)) {
- EXOFS_ERR("Stripe Unit(0x%llx)"
- " must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(sbi->layout.stripe_unit), PAGE_SIZE);
- return -EINVAL;
- }
-
- if (sbi->layout.group_width) {
- if (!sbi->layout.group_depth) {
- EXOFS_ERR("group_depth == 0 && group_width != 0\n");
- return -EINVAL;
- }
- sbi->layout.group_count = numdevs / sbi->layout.mirrors_p1 /
- sbi->layout.group_width;
- } else {
- if (sbi->layout.group_depth) {
- printk(KERN_NOTICE "Warning: group_depth ignored "
- "group_width == 0 && group_depth == %lld\n",
- _LLU(sbi->layout.group_depth));
- }
- sbi->layout.group_width = numdevs / sbi->layout.mirrors_p1;
- sbi->layout.group_depth = -1;
- sbi->layout.group_count = 1;
- }
-
- stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
- if (stripe_length >= (1ULL << 32)) {
- EXOFS_ERR("Total Stripe length(0x%llx)"
- " >= 32bit is not supported\n", _LLU(stripe_length));
- return -EINVAL;
- }
+ ret = ore_verify_layout(numdevs, &sbi->layout);

EXOFS_DBGMSG("exofs: layout: "
"num_comps=%u stripe_unit=0x%x group_width=%u "
@@ -547,7 +504,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
_LLU(sbi->layout.group_depth),
sbi->layout.mirrors_p1,
sbi->layout.raid_algorithm);
- return 0;
+ return ret;
}

static unsigned __ra_pages(struct ore_layout *layout)
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index 492b70d..716dbea 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -42,6 +42,13 @@ struct ore_layout {
unsigned group_width;
u64 group_depth;
unsigned group_count;
+
+ /* Cached often needed calculations filled in by
+ * ore_verify_layout
+ */
+ unsigned long max_io_length; /* Max length that should be passed to
+ * ore_get_rw_state
+ */
};

struct ore_dev {
@@ -138,6 +145,7 @@ static inline unsigned ore_io_state_size(unsigned numdevs)
}

/* ore.c */
+int ore_verify_layout(unsigned total_comps, struct ore_layout *layout);
int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
bool is_reading, u64 offset, u64 length,
struct ore_io_state **ios);
--
1.7.2.3


2011-10-04 10:28:55

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 02/19] exofs: Remove unused data_map member from exofs_sb_info

The struct pnfs_osd_data_map data_map member of exofs_sb_info was
never used after mount. In fact all its members were duplicated
by the ore_layout structure. So just remove the duplicated information.

Also removed some stupid, but perfectly supported, restrictions on
layout parameters. The case where num_devices is not divisible by
mirror_count+1 is perfectly fine since the rotating device view
will eventually use all the devices it can get.

Signed-off-by: Boaz Harrosh <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/exofs/exofs.h | 3 --
fs/exofs/super.c | 57 ++++++++++++++++++-----------------------------
include/scsi/osd_ore.h | 2 +
3 files changed, 24 insertions(+), 38 deletions(-)

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c09d5a7..3b2e047 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -66,9 +66,6 @@ struct exofs_sb_info {
u32 s_next_generation; /* next gen # to use */
atomic_t s_curr_pending; /* number of pending commands */

- struct pnfs_osd_data_map data_map; /* Default raid to use
- * FIXME: Needed ?
- */
struct ore_layout layout; /* Default files layout */
struct ore_comp one_comp; /* id & cred of partition id=0*/
struct ore_components oc; /* comps for the partition */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index babb195..90b4c52 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -481,64 +481,51 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
{
u64 stripe_length;

- sbi->data_map.odm_num_comps =
- le32_to_cpu(dt->dt_data_map.cb_num_comps);
- sbi->data_map.odm_stripe_unit =
+ sbi->layout.stripe_unit =
le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
- sbi->data_map.odm_group_width =
+ sbi->layout.group_width =
le32_to_cpu(dt->dt_data_map.cb_group_width);
- sbi->data_map.odm_group_depth =
+ sbi->layout.group_depth =
le32_to_cpu(dt->dt_data_map.cb_group_depth);
- sbi->data_map.odm_mirror_cnt =
- le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
- sbi->data_map.odm_raid_algorithm =
+ sbi->layout.mirrors_p1 =
+ le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
+ sbi->layout.raid_algorithm =
le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);

/* FIXME: Only raid0 for now. if not so, do not mount */
- if (sbi->data_map.odm_num_comps != numdevs) {
- EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
- sbi->data_map.odm_num_comps, numdevs);
- return -EINVAL;
- }
- if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
+ if (sbi->layout.raid_algorithm != PNFS_OSD_RAID_0) {
EXOFS_ERR("Only RAID_0 for now\n");
return -EINVAL;
}
- if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
- EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
- numdevs, sbi->data_map.odm_mirror_cnt);
+ if (numdevs < (sbi->layout.group_width * sbi->layout.mirrors_p1)) {
+ EXOFS_ERR("Data Map wrong, "
+ "numdevs=%d < group_width=%d * mirrors=%d\n",
+ numdevs, sbi->layout.group_width,
+ sbi->layout.mirrors_p1);
return -EINVAL;
}

- if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
+ if (0 != (sbi->layout.stripe_unit & ~PAGE_MASK)) {
EXOFS_ERR("Stripe Unit(0x%llx)"
" must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
+ _LLU(sbi->layout.stripe_unit), PAGE_SIZE);
return -EINVAL;
}

- sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
- sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
-
- if (sbi->data_map.odm_group_width) {
- sbi->layout.group_width = sbi->data_map.odm_group_width;
- sbi->layout.group_depth = sbi->data_map.odm_group_depth;
+ if (sbi->layout.group_width) {
if (!sbi->layout.group_depth) {
EXOFS_ERR("group_depth == 0 && group_width != 0\n");
return -EINVAL;
}
- sbi->layout.group_count = sbi->data_map.odm_num_comps /
- sbi->layout.mirrors_p1 /
- sbi->data_map.odm_group_width;
+ sbi->layout.group_count = numdevs / sbi->layout.mirrors_p1 /
+ sbi->layout.group_width;
} else {
- if (sbi->data_map.odm_group_depth) {
+ if (sbi->layout.group_depth) {
printk(KERN_NOTICE "Warning: group_depth ignored "
- "group_width == 0 && group_depth == %d\n",
- sbi->data_map.odm_group_depth);
- sbi->data_map.odm_group_depth = 0;
+ "group_width == 0 && group_depth == %lld\n",
+ _LLU(sbi->layout.group_depth));
}
- sbi->layout.group_width = sbi->data_map.odm_num_comps /
- sbi->layout.mirrors_p1;
+ sbi->layout.group_width = numdevs / sbi->layout.mirrors_p1;
sbi->layout.group_depth = -1;
sbi->layout.group_count = 1;
}
@@ -558,7 +545,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
sbi->layout.group_width,
_LLU(sbi->layout.group_depth),
sbi->layout.mirrors_p1,
- sbi->data_map.odm_raid_algorithm);
+ sbi->layout.raid_algorithm);
return 0;
}

diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index 954292a..f7fabb4 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -34,6 +34,8 @@ struct ore_comp {

struct ore_layout {
/* Our way of looking at the data_map */
+ enum pnfs_osd_raid_algorithm4
+ raid_algorithm;
unsigned stripe_unit;
unsigned mirrors_p1;

--
1.7.2.3


2011-10-04 10:36:31

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 17/19] pnfs-obj: move to ore 01: ore_layout & ore_components

For Ease of reviewing I split the move to ore into 3 parts
move to ore 01: ore_layout & ore_components
move to ore 02: move to ORE
move to ore 03: Remove old raid engine

This patch modifies the objio_lseg, layout-segment level
and devices and components arrays to use the ORE types.

Though it will be removed soon, the old raid engine is also
modified so that it actually compiles, and possibly runs, with
the new types. So it is the same old raid engine but
with some new ORE types.

For Ease of reviewing, some of the old code is
"#if 0" but is not removed so the diff command works
better. The old code will be removed in the 3rd patch.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 272 ++++++++++++++++++++----------------------
1 files changed, 128 insertions(+), 144 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 2347e0a..bd7ec26 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -38,7 +38,7 @@
*/

#include <linux/module.h>
-#include <scsi/osd_initiator.h>
+#include <scsi/osd_ore.h>

#include "objlayout.h"

@@ -52,7 +52,7 @@ enum { BIO_MAX_PAGES_KMALLOC =

struct objio_dev_ent {
struct nfs4_deviceid_node id_node;
- struct osd_dev *od;
+ struct ore_dev od;
};

static void
@@ -60,8 +60,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
{
struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);

- dprintk("%s: free od=%p\n", __func__, de->od);
- osduld_put_device(de->od);
+ dprintk("%s: free od=%p\n", __func__, de->od.od);
+ osduld_put_device(de->od.od);
kfree(de);
}

@@ -98,12 +98,12 @@ _dev_list_add(const struct nfs_server *nfss,
nfss->pnfs_curr_ld,
nfss->nfs_client,
d_id);
- de->od = od;
+ de->od.od = od;

d = nfs4_insert_deviceid_node(&de->id_node);
n = container_of(d, struct objio_dev_ent, id_node);
if (n != de) {
- dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
+ dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
objio_free_deviceid_node(&de->id_node);
de = n;
}
@@ -111,28 +111,11 @@ _dev_list_add(const struct nfs_server *nfss,
return de;
}

-struct caps_buffers {
- u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
- u8 creds[OSD_CAP_LEN];
-};
-
struct objio_segment {
struct pnfs_layout_segment lseg;

- struct pnfs_osd_object_cred *comps;
-
- unsigned mirrors_p1;
- unsigned stripe_unit;
- unsigned group_width; /* Data stripe_units without integrity comps */
- u64 group_depth;
- unsigned group_count;
-
- unsigned max_io_size;
-
- unsigned comps_index;
- unsigned num_comps;
- /* variable length */
- struct objio_dev_ent *ods[];
+ struct ore_layout layout;
+ struct ore_components oc;
};

static inline struct objio_segment *
@@ -155,7 +138,8 @@ struct objio_state {
loff_t offset;
bool sync;

- struct objio_segment *layout;
+ struct ore_layout *layout;
+ struct ore_components *oc;

struct kref kref;
objio_done_fn done;
@@ -175,32 +159,33 @@ struct objio_state {

/* Send and wait for a get_device_info of devices in the layout,
then look them up with the osd_initiator library */
-static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
- struct objio_segment *objio_seg, unsigned comp,
- gfp_t gfp_flags)
+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
+ struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
+ gfp_t gfp_flags)
{
struct pnfs_osd_deviceaddr *deviceaddr;
- struct nfs4_deviceid *d_id;
struct objio_dev_ent *ode;
struct osd_dev *od;
struct osd_dev_info odi;
int err;

- d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
-
ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
- if (ode)
- return ode;
+ if (ode) {
+ objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
+ return 0;
+ }

err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
if (unlikely(err)) {
dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
__func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
- return ERR_PTR(err);
+ return err;
}

odi.systemid_len = deviceaddr->oda_systemid.len;
if (odi.systemid_len > sizeof(odi.systemid)) {
+ dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
+ __func__, sizeof(odi.systemid));
err = -EINVAL;
goto out;
} else if (odi.systemid_len)
@@ -225,38 +210,15 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,

ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
gfp_flags);
-
+ objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
+ dprintk("Adding new dev_id(%llx:%llx)\n",
+ _DEVID_LO(d_id), _DEVID_HI(d_id));
out:
- dprintk("%s: return=%d\n", __func__, err);
objlayout_put_deviceinfo(deviceaddr);
- return err ? ERR_PTR(err) : ode;
-}
-
-static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
- struct objio_segment *objio_seg,
- gfp_t gfp_flags)
-{
- unsigned i;
- int err;
-
- /* lookup all devices */
- for (i = 0; i < objio_seg->num_comps; i++) {
- struct objio_dev_ent *ode;
-
- ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
- if (unlikely(IS_ERR(ode))) {
- err = PTR_ERR(ode);
- goto out;
- }
- objio_seg->ods[i] = ode;
- }
- err = 0;
-
-out:
- dprintk("%s: return=%d\n", __func__, err);
return err;
}

+#if 0
static int _verify_data_map(struct pnfs_osd_layout *layout)
{
struct pnfs_osd_data_map *data_map = &layout->olo_map;
@@ -296,23 +258,45 @@ static int _verify_data_map(struct pnfs_osd_layout *layout)

return 0;
}
+#endif

-static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
- struct pnfs_osd_object_cred *src_comp,
- struct caps_buffers *caps_p)
+static void copy_single_comp(struct ore_components *oc, unsigned c,
+ struct pnfs_osd_object_cred *src_comp)
{
- WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
- WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
+ struct ore_comp *ocomp = &oc->comps[c];

- *cur_comp = *src_comp;
+ WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
+ WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));

- memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
- sizeof(caps_p->caps_key));
- cur_comp->oc_cap_key.cred = caps_p->caps_key;
+ ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
+ ocomp->obj.id = src_comp->oc_object_id.oid_object_id;

- memcpy(caps_p->creds, src_comp->oc_cap.cred,
- sizeof(caps_p->creds));
- cur_comp->oc_cap.cred = caps_p->creds;
+ memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
+}
+
+int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
+ struct objio_segment **pseg)
+{
+ struct __alloc_objio_segment {
+ struct objio_segment olseg;
+ struct ore_dev *ods[numdevs];
+ struct ore_comp comps[numdevs];
+ } *aolseg;
+
+ aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
+ if (unlikely(!aolseg)) {
+ dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
+ numdevs, sizeof(*aolseg));
+ return -ENOMEM;
+ }
+
+ aolseg->olseg.oc.numdevs = numdevs;
+ aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
+ aolseg->olseg.oc.comps = aolseg->comps;
+ aolseg->olseg.oc.ods = aolseg->ods;
+
+ *pseg = &aolseg->olseg;
+ return 0;
}

int objio_alloc_lseg(struct pnfs_layout_segment **outp,
@@ -324,59 +308,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
struct objio_segment *objio_seg;
struct pnfs_osd_xdr_decode_layout_iter iter;
struct pnfs_osd_layout layout;
- struct pnfs_osd_object_cred *cur_comp, src_comp;
- struct caps_buffers *caps_p;
+ struct pnfs_osd_object_cred src_comp;
+ unsigned cur_comp;
int err;

err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
if (unlikely(err))
return err;

- err = _verify_data_map(&layout);
+ err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
if (unlikely(err))
return err;

- objio_seg = kzalloc(sizeof(*objio_seg) +
- sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
- sizeof(*objio_seg->comps) * layout.olo_num_comps +
- sizeof(struct caps_buffers) * layout.olo_num_comps,
- gfp_flags);
- if (!objio_seg)
- return -ENOMEM;
+ objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
+ objio_seg->layout.group_width = layout.olo_map.odm_group_width;
+ objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
+ objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
+ objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;

- objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
- cur_comp = objio_seg->comps;
- caps_p = (void *)(cur_comp + layout.olo_num_comps);
- while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
- copy_single_comp(cur_comp++, &src_comp, caps_p++);
+ err = ore_verify_layout(layout.olo_map.odm_num_comps,
+ &objio_seg->layout);
if (unlikely(err))
goto err;

- objio_seg->num_comps = layout.olo_num_comps;
- objio_seg->comps_index = layout.olo_comps_index;
- err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
- if (err)
- goto err;
-
- objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
- objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
- if (layout.olo_map.odm_group_width) {
- objio_seg->group_width = layout.olo_map.odm_group_width;
- objio_seg->group_depth = layout.olo_map.odm_group_depth;
- objio_seg->group_count = layout.olo_map.odm_num_comps /
- objio_seg->mirrors_p1 /
- objio_seg->group_width;
- } else {
- objio_seg->group_width = layout.olo_map.odm_num_comps /
- objio_seg->mirrors_p1;
- objio_seg->group_depth = -1;
- objio_seg->group_count = 1;
+ objio_seg->oc.first_dev = layout.olo_comps_index;
+ cur_comp = 0;
+ while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+ copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
+ err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
+ &src_comp.oc_object_id.oid_device_id,
+ gfp_flags);
+ if (err)
+ goto err;
+ ++cur_comp;
}
-
- /* Cache this calculation it will hit for every page */
- objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
- objio_seg->stripe_unit) *
- objio_seg->group_width;
+ /* pnfs_osd_xdr_decode_layout_comp returns false on error */
+ if (unlikely(err))
+ goto err;

*outp = &objio_seg->lseg;
return 0;
@@ -393,10 +361,14 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
int i;
struct objio_segment *objio_seg = OBJIO_LSEG(lseg);

- for (i = 0; i < objio_seg->num_comps; i++) {
- if (!objio_seg->ods[i])
+ for (i = 0; i < objio_seg->oc.numdevs; i++) {
+ struct ore_dev *od = objio_seg->oc.ods[i];
+ struct objio_dev_ent *ode;
+
+ if (!od)
break;
- nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
+ ode = container_of(od, typeof(*ode), od);
+ nfs4_put_deviceid_node(&ode->id_node);
}
kfree(objio_seg);
}
@@ -411,8 +383,8 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
struct objio_state *ios;
struct __alloc_objio_state {
struct objio_state objios;
- struct _objio_per_comp per_dev[objio_seg->num_comps];
- struct pnfs_osd_ioerr ioerrs[objio_seg->num_comps];
+ struct _objio_per_comp per_dev[objio_seg->oc.numdevs];
+ struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
} *aos;

aos = kzalloc(sizeof(*aos), gfp_flags);
@@ -421,8 +393,9 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,

ios = &aos->objios;

- ios->layout = objio_seg;
- objlayout_init_ioerrs(&aos->objios.oir, objio_seg->num_comps,
+ ios->layout = &objio_seg->layout;
+ ios->oc = &objio_seg->oc;
+ objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
aos->ioerrs, rpcdata, pnfs_layout_type);

ios->pages = pages;
@@ -474,6 +447,27 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
}
}

+static void __on_dev_error(struct objio_state *ios, bool is_write,
+ struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
+ u64 dev_offset, u64 dev_len)
+{
+ struct objio_state *objios = ios->private;
+ struct pnfs_osd_objid pooid;
+ struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
+ /* FIXME: what to do with more-then-one-group layouts. We need to
+ * translate from ore_io_state index to oc->comps index
+ */
+ unsigned comp = dev_index;
+
+ pooid.oid_device_id = ode->id_node.deviceid;
+ pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
+ pooid.oid_object_id = ios->oc->comps[comp].obj.id;
+
+ objlayout_io_set_result(&objios->oir, comp,
+ &pooid, osd_pri_2_pnfs_err(oep),
+ dev_offset, dev_len, is_write);
+}
+
static void _clear_bio(struct bio *bio)
{
struct bio_vec *bv;
@@ -518,12 +512,9 @@ static int _io_check(struct objio_state *ios, bool is_write)

continue; /* we recovered */
}
- objlayout_io_set_result(&ios->oir, i,
- &ios->layout->comps[i].oc_object_id,
- osd_pri_2_pnfs_err(osi.osd_err_pri),
- ios->per_dev[i].offset,
- ios->per_dev[i].length,
- is_write);
+ __on_dev_error(ios, is_write, ios->oc->ods[i],
+ ios->per_dev[i].dev, osi.osd_err_pri,
+ ios->per_dev[i].offset, ios->per_dev[i].length);

if (osi.osd_err_pri >= oep) {
oep = osi.osd_err_pri;
@@ -558,11 +549,11 @@ static void _io_free(struct objio_state *ios)

struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
{
- unsigned min_dev = ios->layout->comps_index;
- unsigned max_dev = min_dev + ios->layout->num_comps;
+ unsigned min_dev = ios->oc->first_dev;
+ unsigned max_dev = min_dev + ios->oc->numdevs;

BUG_ON(dev < min_dev || max_dev <= dev);
- return ios->layout->ods[dev - min_dev]->od;
+ return ios->oc->ods[dev - min_dev]->od;
}

struct _striping_info {
@@ -820,12 +811,9 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
struct osd_request *or = NULL;
struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
unsigned dev = per_dev->dev;
- struct pnfs_osd_object_cred *cred =
- &ios->layout->comps[cur_comp];
- struct osd_obj_id obj = {
- .partition = cred->oc_object_id.oid_partition_id,
- .id = cred->oc_object_id.oid_object_id,
- };
+ struct ore_comp *cred =
+ &ios->oc->comps[cur_comp];
+ struct osd_obj_id obj = cred->obj;
int ret;

or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
@@ -837,7 +825,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)

osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);

- ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+ ret = osd_finalize_request(or, 0, cred->cred, NULL);
if (ret) {
dprintk("%s: Faild to osd_finalize_request() => %d\n",
__func__, ret);
@@ -924,12 +912,8 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)

for (; cur_comp < last_comp; ++cur_comp, ++dev) {
struct osd_request *or = NULL;
- struct pnfs_osd_object_cred *cred =
- &ios->layout->comps[cur_comp];
- struct osd_obj_id obj = {
- .partition = cred->oc_object_id.oid_partition_id,
- .id = cred->oc_object_id.oid_object_id,
- };
+ struct ore_comp *cred = &ios->oc->comps[cur_comp];
+ struct osd_obj_id obj = cred->obj;
struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
struct bio *bio;

@@ -964,7 +948,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)

osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);

- ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+ ret = osd_finalize_request(or, 0, cred->cred, NULL);
if (ret) {
dprintk("%s: Faild to osd_finalize_request() => %d\n",
__func__, ret);
@@ -1030,7 +1014,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
return false;

return pgio->pg_count + req->wb_bytes <=
- OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
+ OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
}

static const struct nfs_pageio_ops objio_pg_read_ops = {
--
1.7.2.3


2011-10-04 10:36:10

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 16/19] pnfs-obj: Rename objlayout_io_state => objlayout_io_res

* All instances of objlayout_io_state => objlayout_io_res
* All instances of state => oir;
* All instances of ol_state => oir;

Big but nothing to it

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 17 +++++------
fs/nfs/objlayout/objlayout.c | 63 ++++++++++++++++++++---------------------
fs/nfs/objlayout/objlayout.h | 15 ++++++----
3 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 48eb91a..2347e0a 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -146,7 +146,7 @@ typedef int (*objio_done_fn)(struct objio_state *ios);

struct objio_state {
/* Generic layer */
- struct objlayout_io_state ol_state;
+ struct objlayout_io_res oir;

struct page **pages;
unsigned pgbase;
@@ -422,7 +422,7 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
ios = &aos->objios;

ios->layout = objio_seg;
- objlayout_init_ioerrs(&aos->objios.ol_state, objio_seg->num_comps,
+ objlayout_init_ioerrs(&aos->objios.oir, objio_seg->num_comps,
aos->ioerrs, rpcdata, pnfs_layout_type);

ios->pages = pages;
@@ -437,10 +437,9 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
return 0;
}

-void objio_free_result(struct objlayout_io_state *ol_state)
+void objio_free_result(struct objlayout_io_res *oir)
{
- struct objio_state *ios = container_of(ol_state, struct objio_state,
- ol_state);
+ struct objio_state *ios = container_of(oir, struct objio_state, oir);

kfree(ios);
}
@@ -519,7 +518,7 @@ static int _io_check(struct objio_state *ios, bool is_write)

continue; /* we recovered */
}
- objlayout_io_set_result(&ios->ol_state, i,
+ objlayout_io_set_result(&ios->oir, i,
&ios->layout->comps[i].oc_object_id,
osd_pri_2_pnfs_err(osi.osd_err_pri),
ios->per_dev[i].offset,
@@ -812,7 +811,7 @@ static int _read_done(struct objio_state *ios)
else
status = ret;

- objlayout_read_done(&ios->ol_state, status, ios->sync);
+ objlayout_read_done(&ios->oir, status, ios->sync);
return ret;
}

@@ -906,13 +905,13 @@ static int _write_done(struct objio_state *ios)
if (likely(!ret)) {
/* FIXME: should be based on the OSD's persistence model
* See OSD2r05 Section 4.13 Data persistence model */
- ios->ol_state.committed = NFS_FILE_SYNC;
+ ios->oir.committed = NFS_FILE_SYNC;
status = ios->length;
} else {
status = ret;
}

- objlayout_write_done(&ios->ol_state, status, ios->sync);
+ objlayout_write_done(&ios->oir, status, ios->sync);
return ret;
}

diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index a82053a..72074e3 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -179,16 +179,16 @@ void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
* I/O done common code
*/
static void
-objlayout_iodone(struct objlayout_io_state *state)
+objlayout_iodone(struct objlayout_io_res *oir)
{
- if (likely(state->status >= 0)) {
- objio_free_result(state);
+ if (likely(oir->status >= 0)) {
+ objio_free_result(oir);
} else {
- struct objlayout *objlay = state->objlay;
+ struct objlayout *objlay = oir->objlay;

spin_lock(&objlay->lock);
objlay->delta_space_valid = OBJ_DSU_INVALID;
- list_add(&objlay->err_list, &state->err_list);
+ list_add(&objlay->err_list, &oir->err_list);
spin_unlock(&objlay->lock);
}
}
@@ -200,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state)
* the error for later reporting at layout-return.
*/
void
-objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
+objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
struct pnfs_osd_objid *pooid, int osd_error,
u64 offset, u64 length, bool is_write)
{
- struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
+ struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];

- BUG_ON(index >= state->num_comps);
+ BUG_ON(index >= oir->num_comps);
if (osd_error) {
ioerr->oer_component = *pooid;
ioerr->oer_comp_offset = offset;
@@ -247,15 +247,15 @@ static void _rpc_read_complete(struct work_struct *work)
}

void
-objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_read_data *rdata = state->rpcdata;
+ struct nfs_read_data *rdata = oir->rpcdata;

- state->status = rdata->task.tk_status = status;
+ oir->status = rdata->task.tk_status = status;
if (status >= 0)
rdata->res.count = status;
- objlayout_iodone(state);
- /* must not use state after this point */
+ objlayout_iodone(oir);
+ /* must not use oir after this point */

dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
status, rdata->res.eof, sync);
@@ -326,17 +326,16 @@ static void _rpc_write_complete(struct work_struct *work)
}

void
-objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
- bool sync)
+objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_write_data *wdata = state->rpcdata;
+ struct nfs_write_data *wdata = oir->rpcdata;

- state->status = wdata->task.tk_status = status;
+ oir->status = wdata->task.tk_status = status;
if (status >= 0) {
wdata->res.count = status;
- wdata->verf.committed = state->committed;
+ wdata->verf.committed = oir->committed;
}
- objlayout_iodone(state);
+ objlayout_iodone(oir);
/* must not use oir after this point */

dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
@@ -475,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err,
static void
encode_accumulated_error(struct objlayout *objlay, __be32 *p)
{
- struct objlayout_io_state *state, *tmp;
+ struct objlayout_io_res *oir, *tmp;
struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};

- list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
unsigned i;

- for (i = 0; i < state->num_comps; i++) {
- struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+ for (i = 0; i < oir->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];

if (!ioerr->oer_errno)
continue;
@@ -501,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)

merge_ioerr(&accumulated_err, ioerr);
}
- list_del(&state->err_list);
- objio_free_result(state);
+ list_del(&oir->err_list);
+ objio_free_result(oir);
}

pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
@@ -514,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
const struct nfs4_layoutreturn_args *args)
{
struct objlayout *objlay = OBJLAYOUT(pnfslay);
- struct objlayout_io_state *state, *tmp;
+ struct objlayout_io_res *oir, *tmp;
__be32 *start;

dprintk("%s: Begin\n", __func__);
@@ -523,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,

spin_lock(&objlay->lock);

- list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
__be32 *last_xdr = NULL, *p;
unsigned i;
int res = 0;

- for (i = 0; i < state->num_comps; i++) {
- struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+ for (i = 0; i < oir->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];

if (!ioerr->oer_errno)
continue;
@@ -553,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
}

last_xdr = p;
- pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
+ pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
}

/* TODO: use xdr_write_pages */
@@ -569,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
encode_accumulated_error(objlay, last_xdr);
goto loop_done;
}
- list_del(&state->err_list);
- objio_free_result(state);
+ list_del(&oir->err_list);
+ objio_free_result(oir);
}
loop_done:
spin_unlock(&objlay->lock);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index d7b2ccfa..81a73ec 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -74,7 +74,7 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
* per-I/O operation state
* embedded in objects provider io_state data structure
*/
-struct objlayout_io_state {
+struct objlayout_io_res {
struct objlayout *objlay;

void *rpcdata;
@@ -93,7 +93,7 @@ struct objlayout_io_state {
};

static inline
-void objlayout_init_ioerrs(struct objlayout_io_state *oir, unsigned num_comps,
+void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
struct pnfs_layout_hdr *pnfs_layout_type)
{
@@ -114,7 +114,10 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
gfp_t gfp_flags);
extern void objio_free_lseg(struct pnfs_layout_segment *lseg);

-extern void objio_free_result(struct objlayout_io_state *state);
+/* objio_free_result will free these @oir structs recieved from
+ * objlayout_{read,write}_done
+ */
+extern void objio_free_result(struct objlayout_io_res *oir);

extern int objio_read_pagelist(struct nfs_read_data *rdata);
extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
@@ -122,7 +125,7 @@ extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
/*
* callback API
*/
-extern void objlayout_io_set_result(struct objlayout_io_state *state,
+extern void objlayout_io_set_result(struct objlayout_io_res *oir,
unsigned index, struct pnfs_osd_objid *pooid,
int osd_error, u64 offset, u64 length, bool is_write);

@@ -141,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
spin_unlock(&objlay->lock);
}

-extern void objlayout_read_done(struct objlayout_io_state *state,
+extern void objlayout_read_done(struct objlayout_io_res *oir,
ssize_t status, bool sync);
-extern void objlayout_write_done(struct objlayout_io_state *state,
+extern void objlayout_write_done(struct objlayout_io_res *oir,
ssize_t status, bool sync);

extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
--
1.7.2.3


2011-10-04 10:28:22

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 01/19] exofs: Rename struct ore_components comps => oc

ore_components already has a comps member so this leads
to things like comps->comps which is annoying. The name oc
was already used in new code. So rename all old usage of
ore_components comps => ore_components oc.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/exofs.h | 16 ++++++------
fs/exofs/inode.c | 22 +++++++++---------
fs/exofs/ore.c | 30 ++++++++++++------------
fs/exofs/super.c | 58 ++++++++++++++++++++++++------------------------
include/scsi/osd_ore.h | 2 +-
5 files changed, 64 insertions(+), 64 deletions(-)

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index f4e442e..c09d5a7 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -71,7 +71,7 @@ struct exofs_sb_info {
*/
struct ore_layout layout; /* Default files layout */
struct ore_comp one_comp; /* id & cred of partition id=0*/
- struct ore_components comps; /* comps for the partition */
+ struct ore_components oc; /* comps for the partition */
struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
};

@@ -86,7 +86,7 @@ struct exofs_i_info {
uint32_t i_dir_start_lookup; /* which page to start lookup */
uint64_t i_commit_size; /* the object's written length */
struct ore_comp one_comp; /* same component for all devices */
- struct ore_components comps; /* inode view of the device table */
+ struct ore_components oc; /* inode view of the device table */
};

static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations;
* bigger and that the device table repeats twice.
* See: exofs_read_lookup_dev_table()
*/
-static inline void exofs_init_comps(struct ore_components *comps,
+static inline void exofs_init_comps(struct ore_components *oc,
struct ore_comp *one_comp,
struct exofs_sb_info *sbi, osd_id oid)
{
@@ -217,13 +217,13 @@ static inline void exofs_init_comps(struct ore_components *comps,
one_comp->obj.id = oid;
exofs_make_credential(one_comp->cred, &one_comp->obj);

- comps->numdevs = sbi->comps.numdevs;
- comps->single_comp = EC_SINGLE_COMP;
- comps->comps = one_comp;
+ oc->numdevs = sbi->oc.numdevs;
+ oc->single_comp = EC_SINGLE_COMP;
+ oc->comps = one_comp;

/* Round robin device view of the table */
- first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs;
- comps->ods = sbi->comps.ods + first_dev;
+ first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
+ oc->ods = sbi->oc.ods + first_dev;
}

#endif
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f39a38f..61b2f7e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -270,7 +270,7 @@ static int read_exec(struct page_collect *pcol)
return 0;

if (!pcol->ios) {
- int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true,
+ int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
pcol->pg_first << PAGE_CACHE_SHIFT,
pcol->length, &pcol->ios);

@@ -516,7 +516,7 @@ static int write_exec(struct page_collect *pcol)
return 0;

BUG_ON(pcol->ios);
- ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false,
+ ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
pcol->pg_first << PAGE_CACHE_SHIFT,
pcol->length, &pcol->ios);

@@ -860,7 +860,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)

inode->i_mtime = inode->i_ctime = CURRENT_TIME;

- ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize);
+ ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
if (likely(!ret))
truncate_setsize(inode, newsize);

@@ -927,14 +927,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
struct exofs_on_disk_inode_layout *layout;
int ret;

- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
return ret;
}

- attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
- attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
+ attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
+ attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);

ios->in_attr = attrs;
ios->in_attr_len = ARRAY_SIZE(attrs);
@@ -1018,7 +1018,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
return inode;
oi = exofs_i(inode);
__oi_init(oi);
- exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+ exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
exofs_oi_objno(oi));

/* read the inode from the osd */
@@ -1172,13 +1172,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
spin_unlock(&sbi->s_next_gen_lock);
insert_inode_hash(inode);

- exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+ exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
exofs_oi_objno(oi));
exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */

mark_inode_dirty(inode);

- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
return ERR_PTR(ret);
@@ -1267,7 +1267,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
} else
memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));

- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
goto free_args;
@@ -1350,7 +1350,7 @@ void exofs_evict_inode(struct inode *inode)
/* ignore the error, attempt a remove anyway */

/* Now Remove the OSD objects */
- ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
return;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 6114fdf..870f85a 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -49,20 +49,20 @@ MODULE_LICENSE("GPL");

static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
{
- return ios->comps->comps[index & ios->comps->single_comp].cred;
+ return ios->oc->comps[index & ios->oc->single_comp].cred;
}

static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
{
- return &ios->comps->comps[index & ios->comps->single_comp].obj;
+ return &ios->oc->comps[index & ios->oc->single_comp].obj;
}

static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
{
- return ios->comps->ods[index];
+ return ios->oc->ods[index];
}

-int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
+int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
bool is_reading, u64 offset, u64 length,
struct ore_io_state **pios)
{
@@ -71,16 +71,16 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
/*TODO: Maybe use kmem_cach per sbi of size
* exofs_io_state_size(layout->s_numdevs)
*/
- ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL);
+ ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL);
if (unlikely(!ios)) {
ORE_DBGMSG("Failed kzalloc bytes=%d\n",
- ore_io_state_size(comps->numdevs));
+ ore_io_state_size(oc->numdevs));
*pios = NULL;
return -ENOMEM;
}

ios->layout = layout;
- ios->comps = comps;
+ ios->oc = oc;
ios->offset = offset;
ios->length = length;
ios->reading = is_reading;
@@ -90,10 +90,10 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
}
EXPORT_SYMBOL(ore_get_rw_state);

-int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
+int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
struct ore_io_state **ios)
{
- return ore_get_rw_state(layout, comps, true, 0, 0, ios);
+ return ore_get_rw_state(layout, oc, true, 0, 0, ios);
}
EXPORT_SYMBOL(ore_get_io_state);

@@ -476,7 +476,7 @@ int ore_create(struct ore_io_state *ios)
{
int i, ret;

- for (i = 0; i < ios->comps->numdevs; i++) {
+ for (i = 0; i < ios->oc->numdevs; i++) {
struct osd_request *or;

or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -501,7 +501,7 @@ int ore_remove(struct ore_io_state *ios)
{
int i, ret;

- for (i = 0; i < ios->comps->numdevs; i++) {
+ for (i = 0; i < ios->oc->numdevs; i++) {
struct osd_request *or;

or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -768,7 +768,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
ti->max_devs = layout->group_width * layout->group_count;
}

-int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
+int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
u64 size)
{
struct ore_io_state *ios;
@@ -779,7 +779,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
struct _trunc_info ti;
int i, ret;

- ret = ore_get_io_state(layout, comps, &ios);
+ ret = ore_get_io_state(layout, oc, &ios);
if (unlikely(ret))
return ret;

@@ -792,7 +792,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
goto out;
}

- ios->numdevs = ios->comps->numdevs;
+ ios->numdevs = ios->oc->numdevs;

for (i = 0; i < ti.max_devs; ++i) {
struct exofs_trunc_attr *size_attr = &size_attrs[i];
@@ -815,7 +815,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
size_attr->attr.val_ptr = &size_attr->newsize;

ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
- _LLU(comps->comps->obj.id), _LLU(obj_size), i);
+ _LLU(oc->comps->obj.id), _LLU(obj_size), i);
ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
&size_attr->attr);
if (unlikely(ret))
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index afe79ee..babb195 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
struct ore_io_state *ios;
int ret;

- ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
return ret;
@@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
struct ore_io_state *ios;
int ret;

- ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
if (unlikely(ret)) {
EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
return ret;
@@ -360,7 +360,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
struct exofs_sb_info *sbi;
struct exofs_fscb *fscb;
struct ore_comp one_comp;
- struct ore_components comps;
+ struct ore_components oc;
struct ore_io_state *ios;
int ret = -ENOMEM;

@@ -378,9 +378,9 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
* the writeable info is set in exofs_sbi_write_stats() above.
*/

- exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID);
+ exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);

- ret = ore_get_io_state(&sbi->layout, &comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oc, &ios);
if (unlikely(ret))
goto out;

@@ -431,17 +431,17 @@ static void _exofs_print_device(const char *msg, const char *dev_path,

static void exofs_free_sbi(struct exofs_sb_info *sbi)
{
- while (sbi->comps.numdevs) {
- int i = --sbi->comps.numdevs;
- struct osd_dev *od = sbi->comps.ods[i];
+ while (sbi->oc.numdevs) {
+ int i = --sbi->oc.numdevs;
+ struct osd_dev *od = sbi->oc.ods[i];

if (od) {
- sbi->comps.ods[i] = NULL;
+ sbi->oc.ods[i] = NULL;
osduld_put_device(od);
}
}
- if (sbi->comps.ods != sbi->_min_one_dev)
- kfree(sbi->comps.ods);
+ if (sbi->oc.ods != sbi->_min_one_dev)
+ kfree(sbi->oc.ods);
kfree(sbi);
}

@@ -468,7 +468,7 @@ static void exofs_put_super(struct super_block *sb)
msecs_to_jiffies(100));
}

- _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0],
+ _exofs_print_device("Unmounting", NULL, sbi->oc.ods[0],
sbi->one_comp.obj.partition);

bdi_destroy(&sbi->bdi);
@@ -623,7 +623,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
return -ENOMEM;
}

- sbi->comps.numdevs = 0;
+ sbi->oc.numdevs = 0;

comp.obj.partition = sbi->one_comp.obj.partition;
comp.obj.id = EXOFS_DEVTABLE_ID;
@@ -648,13 +648,13 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
goto out;

if (likely(numdevs > 1)) {
- unsigned size = numdevs * sizeof(sbi->comps.ods[0]);
+ unsigned size = numdevs * sizeof(sbi->oc.ods[0]);

/* Twice bigger table: See exofs_init_comps() and below
* comment
*/
- sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL);
- if (unlikely(!sbi->comps.ods)) {
+ sbi->oc.ods = kzalloc(size + size - 1, GFP_KERNEL);
+ if (unlikely(!sbi->oc.ods)) {
EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
numdevs);
ret = -ENOMEM;
@@ -681,8 +681,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
* line. We always keep them in device-table order.
*/
if (fscb_od && osduld_device_same(fscb_od, &odi)) {
- sbi->comps.ods[i] = fscb_od;
- ++sbi->comps.numdevs;
+ sbi->oc.ods[i] = fscb_od;
+ ++sbi->oc.numdevs;
fscb_od = NULL;
continue;
}
@@ -695,8 +695,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
goto out;
}

- sbi->comps.ods[i] = od;
- ++sbi->comps.numdevs;
+ sbi->oc.ods[i] = od;
+ ++sbi->oc.numdevs;

/* Read the fscb of the other devices to make sure the FS
* partition is there.
@@ -719,7 +719,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
out:
kfree(dt);
if (likely(!ret)) {
- unsigned numdevs = sbi->comps.numdevs;
+ unsigned numdevs = sbi->oc.numdevs;

if (unlikely(fscb_od)) {
EXOFS_ERR("ERROR: Bad device-table container device not present\n");
@@ -732,7 +732,7 @@ out:
* starting at this device. See exofs_init_comps()
*/
for (i = 0; i < numdevs - 1; ++i)
- sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
+ sbi->oc.ods[i + numdevs] = sbi->oc.ods[i];
}
return ret;
}
@@ -783,10 +783,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
sbi->one_comp.obj.partition = opts->pid;
sbi->one_comp.obj.id = 0;
exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
- sbi->comps.numdevs = 1;
- sbi->comps.single_comp = EC_SINGLE_COMP;
- sbi->comps.comps = &sbi->one_comp;
- sbi->comps.ods = sbi->_min_one_dev;
+ sbi->oc.numdevs = 1;
+ sbi->oc.single_comp = EC_SINGLE_COMP;
+ sbi->oc.comps = &sbi->one_comp;
+ sbi->oc.ods = sbi->_min_one_dev;

/* fill in some other data by hand */
memset(sb->s_id, 0, sizeof(sb->s_id));
@@ -835,7 +835,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
if (unlikely(ret))
goto free_sbi;
} else {
- sbi->comps.ods[0] = od;
+ sbi->oc.ods[0] = od;
}

__sbi_read_stats(sbi);
@@ -875,7 +875,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}

- _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0],
+ _exofs_print_device("Mounting", opts->dev_name, sbi->oc.ods[0],
sbi->one_comp.obj.partition);
return 0;

@@ -924,7 +924,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
uint64_t used = ULLONG_MAX;
int ret;

- ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
if (ret) {
EXOFS_DBGMSG("ore_get_io_state failed.\n");
return ret;
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index c5c5e00..954292a 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -64,7 +64,7 @@ struct ore_io_state {
ore_io_done_fn done;

struct ore_layout *layout;
- struct ore_components *comps;
+ struct ore_components *oc;

/* Global read/write IO*/
loff_t offset;
--
1.7.2.3


2011-10-04 10:34:47

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 12/19] ore/exofs: Change ore_check_io API

The current ore_check_io API receives a residual
pointer to report partial IO. But it is actually
not used, because in a multiple-device IO there
is never any linearity in the IO failure.

On the other hand, if every failing device is reported
through a received callback, measures can be taken to
handle only the failed devices, one at a time.

This will also be needed by the objects-layout-driver
for its error reporting facility.

Exofs is not currently using the new information and
keeps the old behaviour of failing the complete IO in
case of an error. (No partial completion)

TODO: Use an ore_check_io callback to set_page_error only
the failing pages. And re-dirty write pages.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/inode.c | 14 ++++----------
fs/exofs/ore.c | 29 ++++++++++++++++-------------
include/scsi/osd_ore.h | 5 ++++-
3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 7b1a4ba..69dc236 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -194,19 +194,16 @@ static void update_write_page(struct page *page, int ret)
static int __readpages_done(struct page_collect *pcol)
{
int i;
- u64 resid;
u64 good_bytes;
u64 length = 0;
- int ret = ore_check_io(pcol->ios, &resid);
+ int ret = ore_check_io(pcol->ios, NULL);

if (likely(!ret)) {
good_bytes = pcol->length;
ret = PAGE_WAS_NOT_IN_IO;
} else {
- good_bytes = pcol->length - resid;
+ good_bytes = 0;
}
- if (good_bytes > pcol->ios->length)
- good_bytes = pcol->ios->length;

EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
" length=0x%lx nr_pages=%u\n",
@@ -519,10 +516,9 @@ static void writepages_done(struct ore_io_state *ios, void *p)
{
struct page_collect *pcol = p;
int i;
- u64 resid;
u64 good_bytes;
u64 length = 0;
- int ret = ore_check_io(ios, &resid);
+ int ret = ore_check_io(ios, NULL);

atomic_dec(&pcol->sbi->s_curr_pending);

@@ -530,10 +526,8 @@ static void writepages_done(struct ore_io_state *ios, void *p)
good_bytes = pcol->length;
ret = PAGE_WAS_NOT_IN_IO;
} else {
- good_bytes = pcol->length - resid;
+ good_bytes = 0;
}
- if (good_bytes > pcol->ios->length)
- good_bytes = pcol->ios->length;

EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
" length=0x%lx nr_pages=%u\n",
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 14a9100..0dafd50 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -317,7 +317,7 @@ static void _clear_bio(struct bio *bio)
}
}

-int ore_check_io(struct ore_io_state *ios, u64 *resid)
+int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
{
enum osd_err_priority acumulated_osd_err = 0;
int acumulated_lin_err = 0;
@@ -325,7 +325,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)

for (i = 0; i < ios->numdevs; i++) {
struct osd_sense_info osi;
- struct osd_request *or = ios->per_dev[i].or;
+ struct ore_per_dev_state *per_dev = &ios->per_dev[i];
+ struct osd_request *or = per_dev->or;
int ret;

if (unlikely(!or))
@@ -337,29 +338,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)

if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
/* start read offset passed endof file */
- _clear_bio(ios->per_dev[i].bio);
+ _clear_bio(per_dev->bio);
ORE_DBGMSG("start read offset passed end of file "
"offset=0x%llx, length=0x%llx\n",
- _LLU(ios->per_dev[i].offset),
- _LLU(ios->per_dev[i].length));
+ _LLU(per_dev->offset),
+ _LLU(per_dev->length));

continue; /* we recovered */
}

+ if (on_dev_error) {
+ u64 residual = ios->reading ?
+ or->in.residual : or->out.residual;
+ u64 offset = (ios->offset + ios->length) - residual;
+ struct ore_dev *od = ios->oc->ods[
+ per_dev->dev - ios->oc->first_dev];
+
+ on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri,
+ offset, residual);
+ }
if (osi.osd_err_pri >= acumulated_osd_err) {
acumulated_osd_err = osi.osd_err_pri;
acumulated_lin_err = ret;
}
}

- /* TODO: raid specific residual calculations */
- if (resid) {
- if (likely(!acumulated_lin_err))
- *resid = 0;
- else
- *resid = ios->length;
- }
-
return acumulated_lin_err;
}
EXPORT_SYMBOL(ore_check_io);
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index 716dbea..af2231a 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -153,7 +153,10 @@ int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
struct ore_io_state **ios);
void ore_put_io_state(struct ore_io_state *ios);

-int ore_check_io(struct ore_io_state *ios, u64 *resid);
+typedef void (*ore_on_dev_error)(struct ore_io_state *ios, struct ore_dev *od,
+ unsigned dev_index, enum osd_err_priority oep,
+ u64 dev_offset, u64 dev_len);
+int ore_check_io(struct ore_io_state *ios, ore_on_dev_error rep);

int ore_create(struct ore_io_state *ios);
int ore_remove(struct ore_io_state *ios);
--
1.7.2.3


2011-10-04 10:35:09

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 13/19] pnfs-obj: Remove redundant EOF from objlayout_io_state

The EOF calculation was done on .read_pagelist(), cached
in objlayout_io_state->eof, and set in objlayout_read_done()
into nfs_read_data->res.eof.

So set it directly into nfs_read_data->res.eof and avoid
the extra member.

This is a slight behaviour change because before, eof was
*not* set on an error update at objlayout_read_done(). But
is that a problem? Is the generic layer so sensitive that it
will miss the IO error if eof was set? From my testing
I did not see such a problem.

Benny please review.

Which brings me to a more abstract problem. Why does the
LAYOUT driver need to do this eof calculation? i.e. we
are inspecting generic i_size_read() and, if it is spanned by
offset + count which is received from the generic layer, we set
eof. It looks like all this can/should be done in the generic
layer and not at the LD. Where do NFS and files-LD do it?
It looks like it can be promoted.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/objlayout/objlayout.c | 16 +++++++---------
fs/nfs/objlayout/objlayout.h | 1 -
2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1d06f8e..1300736 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -287,17 +287,14 @@ static void _rpc_read_complete(struct work_struct *work)
void
objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
{
- int eof = state->eof;
- struct nfs_read_data *rdata;
+ struct nfs_read_data *rdata = state->rpcdata;

state->status = status;
- dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof);
- rdata = state->rpcdata;
+ dprintk("%s: Begin status=%zd eof=%d\n", __func__,
+ status, rdata->res.eof);
rdata->task.tk_status = status;
- if (status >= 0) {
+ if (status >= 0)
rdata->res.count = status;
- rdata->res.eof = eof;
- }
objlayout_iodone(state);
/* must not use state after this point */

@@ -330,11 +327,14 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
status = 0;
rdata->res.count = 0;
rdata->res.eof = 1;
+ /*FIXME: do we need to call pnfs_ld_read_done() */
goto out;
}
count = eof - offset;
}

+ rdata->res.eof = (offset + count) >= eof;
+
state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
rdata->args.pages, rdata->args.pgbase,
offset, count,
@@ -345,8 +345,6 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
goto out;
}

- state->eof = state->offset + state->count >= eof;
-
status = objio_read_pagelist(state);
out:
dprintk("%s: Return status %Zd\n", __func__, status);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index a8244c8..ffb884c 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -86,7 +86,6 @@ struct objlayout_io_state {

void *rpcdata;
int status; /* res */
- int eof; /* res */
int committed; /* res */

/* Error reporting (layout_return) */
--
1.7.2.3


2011-10-04 10:37:24

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 19/19] pnfs-obj: move to ore 03: Remove old raid engine

Finally remove all the old raid engine, which is by now
dead code.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 504 ------------------------------------------
1 files changed, 0 insertions(+), 504 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 00b3849..3161da6 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -188,48 +188,6 @@ out:
return err;
}

-#if 0
-static int _verify_data_map(struct pnfs_osd_layout *layout)
-{
- struct pnfs_osd_data_map *data_map = &layout->olo_map;
- u64 stripe_length;
- u32 group_width;
-
-/* FIXME: Only raid0 for now. if not go through MDS */
- if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
- printk(KERN_ERR "Only RAID_0 for now\n");
- return -ENOTSUPP;
- }
- if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
- printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
- data_map->odm_num_comps, data_map->odm_mirror_cnt);
- return -EINVAL;
- }
-
- if (data_map->odm_group_width)
- group_width = data_map->odm_group_width;
- else
- group_width = data_map->odm_num_comps /
- (data_map->odm_mirror_cnt + 1);
-
- stripe_length = (u64)data_map->odm_stripe_unit * group_width;
- if (stripe_length >= (1ULL << 32)) {
- printk(KERN_ERR "Total Stripe length(0x%llx)"
- " >= 32bit is not supported\n", _LLU(stripe_length));
- return -ENOTSUPP;
- }
-
- if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
- printk(KERN_ERR "Stripe Unit(0x%llx)"
- " must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
- return -ENOTSUPP;
- }
-
- return 0;
-}
-#endif
-
static void copy_single_comp(struct ore_components *oc, unsigned c,
struct pnfs_osd_object_cred *src_comp)
{
@@ -441,327 +399,6 @@ static void __on_dev_error(struct ore_io_state *ios,
dev_offset, dev_len, !ios->reading);
}

-#if 0
-static void _clear_bio(struct bio *bio)
-{
- struct bio_vec *bv;
- unsigned i;
-
- __bio_for_each_segment(bv, bio, i, 0) {
- unsigned this_count = bv->bv_len;
-
- if (likely(PAGE_SIZE == this_count))
- clear_highpage(bv->bv_page);
- else
- zero_user(bv->bv_page, bv->bv_offset, this_count);
- }
-}
-
-static int _io_check(struct objio_state *ios, bool is_write)
-{
- enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
- int lin_ret = 0;
- int i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_sense_info osi;
- struct osd_request *or = ios->per_dev[i].or;
- int ret;
-
- if (!or)
- continue;
-
- ret = osd_req_decode_sense(or, &osi);
- if (likely(!ret))
- continue;
-
- if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
- /* start read offset passed endof file */
- BUG_ON(is_write);
- _clear_bio(ios->per_dev[i].bio);
- dprintk("%s: start read offset passed end of file "
- "offset=0x%llx, length=0x%lx\n", __func__,
- _LLU(ios->per_dev[i].offset),
- ios->per_dev[i].length);
-
- continue; /* we recovered */
- }
- __on_dev_error(ios, is_write, ios->oc->ods[i],
- ios->per_dev[i].dev, osi.osd_err_pri,
- ios->per_dev[i].offset, ios->per_dev[i].length);
-
- if (osi.osd_err_pri >= oep) {
- oep = osi.osd_err_pri;
- lin_ret = ret;
- }
- }
-
- return lin_ret;
-}
-
-/*
- * Common IO state helpers.
- */
-static void _io_free(struct objio_state *ios)
-{
- unsigned i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct _objio_per_comp *per_dev = &ios->per_dev[i];
-
- if (per_dev->or) {
- osd_end_request(per_dev->or);
- per_dev->or = NULL;
- }
-
- if (per_dev->bio) {
- bio_put(per_dev->bio);
- per_dev->bio = NULL;
- }
- }
-}
-
-struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
-{
- unsigned min_dev = ios->oc->first_dev;
- unsigned max_dev = min_dev + ios->oc->numdevs;
-
- BUG_ON(dev < min_dev || max_dev <= dev);
- return ios->oc->ods[dev - min_dev]->od;
-}
-
-struct _striping_info {
- u64 obj_offset;
- u64 group_length;
- unsigned dev;
- unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
- struct _striping_info *si)
-{
- u32 stripe_unit = ios->layout->stripe_unit;
- u32 group_width = ios->layout->group_width;
- u64 group_depth = ios->layout->group_depth;
- u32 U = stripe_unit * group_width;
-
- u64 T = U * group_depth;
- u64 S = T * ios->layout->group_count;
- u64 M = div64_u64(file_offset, S);
-
- /*
- G = (L - (M * S)) / T
- H = (L - (M * S)) % T
- */
- u64 LmodU = file_offset - M * S;
- u32 G = div64_u64(LmodU, T);
- u64 H = LmodU - G * T;
-
- u32 N = div_u64(H, U);
-
- div_u64_rem(file_offset, stripe_unit, &si->unit_off);
- si->obj_offset = si->unit_off + (N * stripe_unit) +
- (M * group_depth * stripe_unit);
-
- /* "H - (N * U)" is just "H % U" so it's bound to u32 */
- si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
- si->dev *= ios->layout->mirrors_p1;
-
- si->group_length = T - H;
-}
-
-static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct _objio_per_comp *per_dev, int len,
- gfp_t gfp_flags)
-{
- unsigned pg = *cur_pg;
- int cur_len = len;
- struct request_queue *q =
- osd_request_queue(_io_od(ios, per_dev->dev));
-
- if (per_dev->bio == NULL) {
- unsigned pages_in_stripe = ios->layout->group_width *
- (ios->layout->stripe_unit / PAGE_SIZE);
- unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
- ios->layout->group_width;
-
- if (BIO_MAX_PAGES_KMALLOC < bio_size)
- bio_size = BIO_MAX_PAGES_KMALLOC;
-
- per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
- if (unlikely(!per_dev->bio)) {
- dprintk("Faild to allocate BIO size=%u\n", bio_size);
- return -ENOMEM;
- }
- }
-
- while (cur_len > 0) {
- unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
- unsigned added_len;
-
- BUG_ON(ios->nr_pages <= pg);
- cur_len -= pglen;
-
- added_len = bio_add_pc_page(q, per_dev->bio,
- ios->pages[pg], pglen, pgbase);
- if (unlikely(pglen != added_len))
- return -ENOMEM;
- pgbase = 0;
- ++pg;
- }
- BUG_ON(cur_len);
-
- per_dev->length += len;
- *cur_pg = pg;
- return 0;
-}
-
-static int _prepare_one_group(struct objio_state *ios, u64 length,
- struct _striping_info *si, unsigned *last_pg,
- gfp_t gfp_flags)
-{
- unsigned stripe_unit = ios->layout->stripe_unit;
- unsigned mirrors_p1 = ios->layout->mirrors_p1;
- unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
- unsigned dev = si->dev;
- unsigned first_dev = dev - (dev % devs_in_group);
- unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
- unsigned cur_pg = *last_pg;
- int ret = 0;
-
- while (length) {
- struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
- unsigned cur_len, page_off = 0;
-
- if (!per_dev->length) {
- per_dev->dev = dev;
- if (dev < si->dev) {
- per_dev->offset = si->obj_offset + stripe_unit -
- si->unit_off;
- cur_len = stripe_unit;
- } else if (dev == si->dev) {
- per_dev->offset = si->obj_offset;
- cur_len = stripe_unit - si->unit_off;
- page_off = si->unit_off & ~PAGE_MASK;
- BUG_ON(page_off &&
- (page_off != ios->pgbase));
- } else { /* dev > si->dev */
- per_dev->offset = si->obj_offset - si->unit_off;
- cur_len = stripe_unit;
- }
-
- if (max_comp < dev - first_dev)
- max_comp = dev - first_dev;
- } else {
- cur_len = stripe_unit;
- }
- if (cur_len >= length)
- cur_len = length;
-
- ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
- cur_len, gfp_flags);
- if (unlikely(ret))
- goto out;
-
- dev += mirrors_p1;
- dev = (dev % devs_in_group) + first_dev;
-
- length -= cur_len;
- ios->length += cur_len;
- }
-out:
- ios->numdevs = max_comp + mirrors_p1;
- *last_pg = cur_pg;
- return ret;
-}
-
-static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
-{
- u64 length = ios->count;
- u64 offset = ios->offset;
- struct _striping_info si;
- unsigned last_pg = 0;
- int ret = 0;
-
- while (length) {
- _calc_stripe_info(ios, offset, &si);
-
- if (length < si.group_length)
- si.group_length = length;
-
- ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
- if (unlikely(ret))
- goto out;
-
- offset += si.group_length;
- length -= si.group_length;
- }
-
-out:
- if (!ios->length)
- return ret;
-
- return 0;
-}
-
-static int _sync_done(struct objio_state *ios)
-{
- struct completion *waiting = ios->private;
-
- complete(waiting);
- return 0;
-}
-
-static void _last_io(struct kref *kref)
-{
- struct objio_state *ios = container_of(kref, struct objio_state, kref);
-
- ios->done(ios);
-}
-
-static void _done_io(struct osd_request *or, void *p)
-{
- struct objio_state *ios = p;
-
- kref_put(&ios->kref, _last_io);
-}
-
-static int _io_exec(struct objio_state *ios)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- int ret = 0;
- unsigned i;
- objio_done_fn saved_done_fn = ios->done;
- bool sync = ios->sync;
-
- if (sync) {
- ios->done = _sync_done;
- ios->private = &wait;
- }
-
- kref_init(&ios->kref);
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
-
- if (!or)
- continue;
-
- kref_get(&ios->kref);
- osd_execute_request_async(or, _done_io, ios);
- }
-
- kref_put(&ios->kref, _last_io);
-
- if (sync) {
- wait_for_completion(&wait);
- ret = saved_done_fn(ios);
- }
-
- return ret;
-}
-#endif
-
/*
* read
*/
@@ -781,63 +418,6 @@ static void _read_done(struct ore_io_state *ios, void *private)
objlayout_read_done(&objios->oir, status, objios->sync);
}

-#if 0
-static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
-{
- struct osd_request *or = NULL;
- struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
- unsigned dev = per_dev->dev;
- struct ore_comp *cred =
- &ios->oc->comps[cur_comp];
- struct osd_obj_id obj = cred->obj;
- int ret;
-
- or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
- if (unlikely(!or)) {
- ret = -ENOMEM;
- goto err;
- }
- per_dev->or = or;
-
- osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
-
- ret = osd_finalize_request(or, 0, cred->cred, NULL);
- if (ret) {
- dprintk("%s: Faild to osd_finalize_request() => %d\n",
- __func__, ret);
- goto err;
- }
-
- dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
- __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
- per_dev->length);
-
-err:
- return ret;
-}
-
-static int _read_exec(struct objio_state *ios)
-{
- unsigned i;
- int ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- if (!ios->per_dev[i].length)
- continue;
- ret = _read_mirrors(ios, i);
- if (unlikely(ret))
- goto err;
- }
-
- ios->done = _read_done;
- return _io_exec(ios);
-
-err:
- _io_free(ios);
- return ret;
-}
-#endif
-
int objio_read_pagelist(struct nfs_read_data *rdata)
{
struct objio_state *objios;
@@ -879,90 +459,6 @@ static void _write_done(struct ore_io_state *ios, void *private)
objlayout_write_done(&objios->oir, status, objios->sync);
}

-#if 0
-static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
-{
- struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
- unsigned dev = ios->per_dev[cur_comp].dev;
- unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
- int ret;
-
- for (; cur_comp < last_comp; ++cur_comp, ++dev) {
- struct osd_request *or = NULL;
- struct ore_comp *cred = &ios->oc->comps[cur_comp];
- struct osd_obj_id obj = cred->obj;
- struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
- struct bio *bio;
-
- or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
- if (unlikely(!or)) {
- ret = -ENOMEM;
- goto err;
- }
- per_dev->or = or;
-
- if (per_dev != master_dev) {
- bio = bio_kmalloc(GFP_NOFS,
- master_dev->bio->bi_max_vecs);
- if (unlikely(!bio)) {
- dprintk("Faild to allocate BIO size=%u\n",
- master_dev->bio->bi_max_vecs);
- ret = -ENOMEM;
- goto err;
- }
-
- __bio_clone(bio, master_dev->bio);
- bio->bi_bdev = NULL;
- bio->bi_next = NULL;
- per_dev->bio = bio;
- per_dev->dev = dev;
- per_dev->length = master_dev->length;
- per_dev->offset = master_dev->offset;
- } else {
- bio = master_dev->bio;
- bio->bi_rw |= REQ_WRITE;
- }
-
- osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
-
- ret = osd_finalize_request(or, 0, cred->cred, NULL);
- if (ret) {
- dprintk("%s: Faild to osd_finalize_request() => %d\n",
- __func__, ret);
- goto err;
- }
-
- dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
- __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
- per_dev->length);
- }
-
-err:
- return ret;
-}
-
-static int _write_exec(struct objio_state *ios)
-{
- unsigned i;
- int ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- if (!ios->per_dev[i].length)
- continue;
- ret = _write_mirrors(ios, i);
- if (unlikely(ret))
- goto err;
- }
-
- ios->done = _write_done;
- return _io_exec(ios);
-
-err:
- _io_free(ios);
- return ret;
-}
-#endif
-
int objio_write_pagelist(struct nfs_write_data *wdata, int how)
{
struct objio_state *objios;
--
1.7.2.3


2011-10-04 12:04:09

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCHSET 00/19] objlayout: Move to ORE

Boaz, please change my email address in your contacts lists
to [email protected], thanks...

On 2011-10-04 12:24, Boaz Harrosh wrote:
>
> Submitted is the move of the objects-layout-driver to the ORE
> (Objects Raid Engine). Which after this patchset will be
> used by both the exofs file system and the objlayoutdriver.
> (ore.ko is its own library since last Kernel)
>
> This code is intended for the 3.2 Kernel and is already
> collecting dust in linux-next. (Though the latest bits from today)
>
> End of this week I will post the RAID5 support to ORE and both
> exofs and objlayoutdriver. Also meant for 3.2
>
> Trond Hi
> The First 12 patches are to the ore and exofs to make them
> compatible with objlayoutdriver. The pnfs-obj patches are
> dependent on the first part been present.
> We can either submit them all through your tree. Or alternatively
> You can send your ACK-by: on the last 7 and I can submit them
> to Linus through my tree. Which ever you prefer.
>
> Benny Hi
> I please need your Review-by: to [PATCH 13/19] and [PATCH 14/19]
> which change code behaviour. The rest of the patches are just conversions
> which in theory do not add or change any functionality. (Unless there
> are bugs, but that's not intended)

Sure

>
> Also, With these patches panfs-layout-driver is totally broken.
> Please just remove it once you rebase on these patches. With the RAID5
> support it is no longer needed. I've started testing with PanFS export
> through the STD objlayoutdriver, hope to finish this week. (We always
> have the old versions)

No problem, less is more :)

Benny

>
> Also tomorrow I will send the needed patch for pnfsd-exofs branch
> that works with these changes.
>
> These are the list off patches:
>
> [PATCH 01/19] exofs: Rename struct ore_components comps => oc
> [PATCH 02/19] exofs: Remove unused data_map member from exofs_sb_info
> [PATCH 03/19] ore: Make ore_striping_info and ore_calc_stripe_info public
> [PATCH 04/19] ore/exofs: Change the type of the devices array (API change)
> [PATCH 05/19] ore: Only IO one group at a time (API change)
> [PATCH 06/19] ore: cleanup: Embed an ore_striping_info inside ore_io_state
> [PATCH 07/19] ore: Remove check for ios->kern_buff in _prepare_for_striping to later
> [PATCH 08/19] exofs: Support for short read/writes
> [PATCH 09/19] ore: Support for short read/writes
> [PATCH 10/19] ore: Support for partial component table
> [PATCH 11/19] ore/exofs: Define new ore_verify_layout
> [PATCH 12/19] ore/exofs: Change ore_check_io API
>
> Up to here are the changes need to ore and exofs so the ore
> can be used by the objlayoutdriver. Any review is welcome.
> Same API will be used for RAID4/5/6 support.
>
> [PATCH 13/19] pnfs-obj: Remove redundant EOF from objlayout_io_state
> [PATCH 14/19] pnfs-obj: Return PNFS_NOT_ATTEMPTED in case of read/write_pagelist
>
> Benny please review these two. They are independent of the ORE
> conversion. I think the [PATCH 14/19] might not be enough and
> Error handling needs "more", but the needed changes are their
> own patch, to come later.
>
> [PATCH 15/19] pnfs-obj: Get rid of objlayout_{alloc,free}_io_state
> [PATCH 16/19] pnfs-obj: Rename objlayout_io_state => objlayout_io_res
> [PATCH 17/19] pnfs-obj: move to ore 01: ore_layout & ore_components
> [PATCH 18/19] pnfs-obj: move to ore 02: move to ORE
> [PATCH 19/19] pnfs-obj: move to ore 03: Remove old raid engine
>
> These 5 stage the move to the ore. With these patches I'm
> able to pass all the tests I passed with the old code.
> Only now with more then 500 lines of code less.
>
> Cheers
> Boaz
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

--
Benny Halevy
CTO, Tonian Inc.

Tel: +972-54-802-8340
[email protected]

2011-10-04 10:31:05

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 06/19] ore: cleanup: Embed an ore_striping_info inside ore_io_state

Now that each ore_io_state covers only a single raid group,
only a single striping_info calculation is needed. Embed one inside
ore_io_state to cache the calculation results and eliminate
an extra call.

Also the outer _prepare_for_striping is removed since it does nothing.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/ore.c | 61 +++++++++++++++++++-----------------------------
include/scsi/osd_ore.h | 1 +
2 files changed, 25 insertions(+), 37 deletions(-)

diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index d54af35..43a53ba 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -121,11 +121,9 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
ios->offset = offset;

if (length) {
- struct ore_striping_info si;
-
- ore_calc_stripe_info(layout, offset, &si);
- ios->length = (length <= si.group_length) ? length :
- si.group_length;
+ ore_calc_stripe_info(layout, offset, &ios->si);
+ ios->length = (length <= ios->si.group_length) ? length :
+ ios->si.group_length;
ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
}

@@ -416,9 +414,9 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
return 0;
}

-static int _prepare_one_group(struct ore_io_state *ios, u64 length,
- struct ore_striping_info *si)
+static int _prepare_for_striping(struct ore_io_state *ios)
{
+ struct ore_striping_info *si = &ios->si;
unsigned stripe_unit = ios->layout->stripe_unit;
unsigned mirrors_p1 = ios->layout->mirrors_p1;
unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
@@ -426,8 +424,27 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
unsigned first_dev = dev - (dev % devs_in_group);
unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
unsigned cur_pg = ios->pages_consumed;
+ u64 length = ios->length;
int ret = 0;

+ if (!ios->pages) {
+ if (ios->kern_buff) {
+ struct ore_per_dev_state *per_dev = &ios->per_dev[0];
+
+ per_dev->offset = si->obj_offset;
+ per_dev->dev = si->dev;
+
+ /* no cross device without page array */
+ BUG_ON((ios->layout->group_width > 1) &&
+ (si->unit_off + ios->length >
+ ios->layout->stripe_unit));
+ }
+ ios->numdevs = ios->layout->mirrors_p1;
+ return 0;
+ }
+
+ BUG_ON(length > si->group_length);
+
while (length) {
unsigned comp = dev - first_dev;
struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
@@ -473,36 +490,6 @@ out:
return ret;
}

-static int _prepare_for_striping(struct ore_io_state *ios)
-{
- struct ore_striping_info si;
- int ret;
-
- if (!ios->pages) {
- if (ios->kern_buff) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[0];
-
- ore_calc_stripe_info(ios->layout, ios->offset, &si);
- per_dev->offset = si.obj_offset;
- per_dev->dev = si.dev;
-
- /* no cross device without page array */
- BUG_ON((ios->layout->group_width > 1) &&
- (si.unit_off + ios->length >
- ios->layout->stripe_unit));
- }
- ios->numdevs = ios->layout->mirrors_p1;
- return 0;
- }
-
- ore_calc_stripe_info(ios->layout, ios->offset, &si);
-
- BUG_ON(ios->length > si.group_length);
- ret = _prepare_one_group(ios, ios->length, &si);
-
- return ret;
-}
-
int ore_create(struct ore_io_state *ios)
{
int i, ret;
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index 8fefdfb..baeef02 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -93,6 +93,7 @@ typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private);

struct ore_io_state {
struct kref kref;
+ struct ore_striping_info si;

void *private;
ore_io_done_fn done;
--
1.7.2.3


2011-10-04 12:20:25

by Jim Rees

[permalink] [raw]
Subject: Re: [PATCH 16/19] pnfs-obj: Rename objlayout_io_state => objlayout_io_res

Boaz Harrosh wrote:

* All instances of objlayout_io_state => objlayout_io_res
* All instances of state => oir;
* All instances of ol_state => oir;

Big but nothing to it

Signed-off-by: Boaz Harrosh <[email protected]>

You have trailing whitespace here:

+ * objlayout_{read,write}_done

2011-10-04 10:30:27

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 05/19] ore: Only IO one group at a time (API change)

Usually a single IO is confined to one group of devices
(group_width) and at the boundary of a raid group it can
spill into a second group. Current code would allocate a
full device_table size array at each io_state so it can
comply with requests that span two groups. Needless to say
that is very wasteful, especially when the device_table count
can get very large (hundreds, even thousands), while a
group_width is usually 8 or 10.

* Change ore API to trim on IO that spans two raid groups.
The user passes offset+length to ore_get_rw_state, the
ore might trim on that length if spanning a group boundary.
The user must check ios->length or ios->nr_pages to see
how much IO will be performed. It is the responsibility
of the user to re-issue the remainder of the IO.

* Modify exofs To copy spilled pages on to the next IO.
This means one last kick is needed after all coalescing
of pages is done.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/inode.c | 95 ++++++++++++++++++++++++++++++++++++++++++--------
fs/exofs/ore.c | 103 ++++++++++++++++++++++++++++++++++++-----------------
2 files changed, 150 insertions(+), 48 deletions(-)

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 61b2f7e..14e408b 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -259,6 +259,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
}
}

+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+ struct page_collect *pcol_src, struct page_collect *pcol)
+{
+ /* length was wrong or offset was not page aligned */
+ BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+ if (pcol_src->nr_pages > ios->nr_pages) {
+ struct page **src_page;
+ unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+ unsigned long len_less = pcol_src->length - ios->length;
+ unsigned i;
+ int ret;
+
+ /* This IO was trimmed */
+ pcol_src->nr_pages = ios->nr_pages;
+ pcol_src->length = ios->length;
+
+ /* Left over pages are passed to the next io */
+ pcol->expected_pages += pages_less;
+ pcol->nr_pages = pages_less;
+ pcol->length = len_less;
+ src_page = pcol_src->pages + pcol_src->nr_pages;
+ pcol->pg_first = (*src_page)->index;
+
+ ret = pcol_try_alloc(pcol);
+ if (unlikely(ret))
+ return ret;
+
+ for (i = 0; i < pages_less; ++i)
+ pcol->pages[i] = *src_page++;
+
+ EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x pages_less=%d "
+ "expected_pages=0x%x next_offset=0x%llx "
+ "next_len=0x%lx\n",
+ pcol_src->nr_pages, pages_less, pcol->expected_pages,
+ pcol->pg_first * PAGE_SIZE, pcol->length);
+ }
+ return 0;
+}
+
static int read_exec(struct page_collect *pcol)
{
struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -280,7 +320,6 @@ static int read_exec(struct page_collect *pcol)

ios = pcol->ios;
ios->pages = pcol->pages;
- ios->nr_pages = pcol->nr_pages;

if (pcol->read_4_write) {
ore_read(pcol->ios);
@@ -296,17 +335,23 @@ static int read_exec(struct page_collect *pcol)
*pcol_copy = *pcol;
ios->done = readpages_done;
ios->private = pcol_copy;
+
+ /* pages ownership was passed to pcol_copy */
+ _pcol_reset(pcol);
+
+ ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+ if (unlikely(ret))
+ goto err;
+
+ EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+ pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
ret = ore_read(ios);
if (unlikely(ret))
goto err;

atomic_inc(&pcol->sbi->s_curr_pending);

- EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
- oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
-
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
return 0;

err:
@@ -429,6 +474,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
return ret;
}

+ ret = read_exec(&pcol);
+ if (unlikely(ret))
+ return ret;
+
return read_exec(&pcol);
}

@@ -519,7 +568,6 @@ static int write_exec(struct page_collect *pcol)
ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
pcol->pg_first << PAGE_CACHE_SHIFT,
pcol->length, &pcol->ios);
-
if (unlikely(ret))
goto err;

@@ -534,10 +582,19 @@ static int write_exec(struct page_collect *pcol)

ios = pcol->ios;
ios->pages = pcol_copy->pages;
- ios->nr_pages = pcol_copy->nr_pages;
ios->done = writepages_done;
ios->private = pcol_copy;

+ /* pages ownership was passed to pcol_copy */
+ _pcol_reset(pcol);
+
+ ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+ if (unlikely(ret))
+ goto err;
+
+ EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+ pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
ret = ore_write(ios);
if (unlikely(ret)) {
EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +602,6 @@ static int write_exec(struct page_collect *pcol)
}

atomic_inc(&pcol->sbi->s_curr_pending);
- EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
- pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
- pcol->length);
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
return 0;

err:
@@ -689,12 +741,25 @@ static int exofs_writepages(struct address_space *mapping,
_pcol_init(&pcol, expected_pages, mapping->host);

ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
- if (ret) {
+ if (unlikely(ret)) {
EXOFS_ERR("write_cache_pages => %d\n", ret);
return ret;
}

- return write_exec(&pcol);
+ ret = write_exec(&pcol);
+ if (unlikely(ret))
+ return ret;
+
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ return write_exec(&pcol); /* pump the last reminder */
+ } else {/* not SYNC let the reminder join the next writeout */
+ unsigned i;
+
+ for (i = 0; i < pcol.nr_pages; i++)
+ unlock_page(pcol.pages[i]);
+
+ return 0;
+ }
}

static int exofs_writepage(struct page *page, struct writeback_control *wbc)
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index a7d7925..d54af35 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -47,6 +47,9 @@ MODULE_AUTHOR("Boaz Harrosh <[email protected]>");
MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
MODULE_LICENSE("GPL");

+static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+ struct ore_striping_info *si);
+
static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
{
return ios->oc->comps[index & ios->oc->single_comp].cred;
@@ -62,38 +65,85 @@ static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
return ore_comp_dev(ios->oc, index);
}

-int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
- bool is_reading, u64 offset, u64 length,
- struct ore_io_state **pios)
+static int _get_io_state(struct ore_layout *layout,
+ struct ore_components *oc, unsigned numdevs,
+ struct ore_io_state **pios)
{
struct ore_io_state *ios;

/*TODO: Maybe use kmem_cach per sbi of size
* exofs_io_state_size(layout->s_numdevs)
*/
- ios = kzalloc(ore_io_state_size(oc->numdevs), GFP_KERNEL);
+ ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL);
if (unlikely(!ios)) {
ORE_DBGMSG("Failed kzalloc bytes=%d\n",
- ore_io_state_size(oc->numdevs));
+ ore_io_state_size(numdevs));
*pios = NULL;
return -ENOMEM;
}

ios->layout = layout;
ios->oc = oc;
- ios->offset = offset;
- ios->length = length;
+ *pios = ios;
+ return 0;
+}
+
+/* Allocate an io_state for only a single group of devices
+ *
+ * If a user needs to call ore_read/write() this version must be used because it
+ * allocates extra stuff for striping and raid.
+ * The ore might decide to only IO less than @length bytes due to alignments
+ * and constraints as follows:
+ * - The IO cannot cross a group boundary.
+ * - In raid5/6 the end of the IO must align at the end of a stripe, e.g.
+ * (@offset + @length) % strip_size == 0. Or the complete range is within a
+ * single stripe.
+ * - Memory condition only permitted a shorter IO. (A user can use @length=~0
+ * and check the returned ios->length for max_io_size.)
+ *
+ * The caller must check the returned ios->length (and/or ios->nr_pages) and
+ * re-issue those pages that fall outside of ios->length
+ */
+int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
+ bool is_reading, u64 offset, u64 length,
+ struct ore_io_state **pios)
+{
+ struct ore_io_state *ios;
+ unsigned numdevs = layout->group_width * layout->mirrors_p1;
+ int ret;
+
+ ret = _get_io_state(layout, oc, numdevs, pios);
+ if (unlikely(ret))
+ return ret;
+
+ ios = *pios;
ios->reading = is_reading;
+ ios->offset = offset;
+
+ if (length) {
+ struct ore_striping_info si;
+
+ ore_calc_stripe_info(layout, offset, &si);
+ ios->length = (length <= si.group_length) ? length :
+ si.group_length;
+ ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+ }

- *pios = ios;
return 0;
}
EXPORT_SYMBOL(ore_get_rw_state);

+/* Allocate an io_state for all the devices in the comps array
+ *
+ * This version of io_state allocation is used mostly by create/remove
+ * and trunc where we currently need all the devices. The only wasteful
+ * bit is the read/write_attributes with no IO. Those sites should
+ * be converted to use ore_get_rw_state() with length=0
+ */
int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
- struct ore_io_state **ios)
+ struct ore_io_state **pios)
{
- return ore_get_rw_state(layout, oc, true, 0, 0, ios);
+ return _get_io_state(layout, oc, oc->numdevs, pios);
}
EXPORT_SYMBOL(ore_get_io_state);

@@ -379,7 +429,8 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
int ret = 0;

while (length) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
+ unsigned comp = dev - first_dev;
+ struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
unsigned cur_len, page_off = 0;

if (!per_dev->length) {
@@ -398,8 +449,8 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
cur_len = stripe_unit;
}

- if (max_comp < dev)
- max_comp = dev;
+ if (max_comp < comp)
+ max_comp = comp;
} else {
cur_len = stripe_unit;
}
@@ -424,10 +475,8 @@ out:

static int _prepare_for_striping(struct ore_io_state *ios)
{
- u64 length = ios->length;
- u64 offset = ios->offset;
struct ore_striping_info si;
- int ret = 0;
+ int ret;

if (!ios->pages) {
if (ios->kern_buff) {
@@ -446,21 +495,11 @@ static int _prepare_for_striping(struct ore_io_state *ios)
return 0;
}

- while (length) {
- ore_calc_stripe_info(ios->layout, offset, &si);
-
- if (length < si.group_length)
- si.group_length = length;
+ ore_calc_stripe_info(ios->layout, ios->offset, &si);

- ret = _prepare_one_group(ios, si.group_length, &si);
- if (unlikely(ret))
- goto out;
+ BUG_ON(ios->length > si.group_length);
+ ret = _prepare_one_group(ios, ios->length, &si);

- offset += si.group_length;
- length -= si.group_length;
- }
-
-out:
return ret;
}

@@ -742,7 +781,6 @@ struct _trunc_info {

unsigned first_group_dev;
unsigned nex_group_dev;
- unsigned max_devs;
};

static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
@@ -757,7 +795,6 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,

ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
ti->nex_group_dev = ti->first_group_dev + layout->group_width;
- ti->max_devs = layout->group_width * layout->group_count;
}

int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
@@ -777,7 +814,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,

_calc_trunk_info(ios->layout, size, &ti);

- size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
+ size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
GFP_KERNEL);
if (unlikely(!size_attrs)) {
ret = -ENOMEM;
@@ -786,7 +823,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,

ios->numdevs = ios->oc->numdevs;

- for (i = 0; i < ti.max_devs; ++i) {
+ for (i = 0; i < ios->numdevs; ++i) {
struct exofs_trunc_attr *size_attr = &size_attrs[i];
u64 obj_size;

--
1.7.2.3


2011-10-04 10:35:30

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 14/19] pnfs-obj: Return PNFS_NOT_ATTEMPTED in case of read/write_pagelist

objlayout driver was always returning PNFS_ATTEMPTED from its
read/write_pagelist operations, even on error. Fix that.

Start by establishing an error return API from io-engine, by
not returning ssize_t (length-or-error) but returning "int":
0=OK, <0=Error. And clean up all return types in io-engine.

Then, if io-engine returned an error, return PNFS_NOT_ATTEMPTED
to the generic layer. (With a dprint)

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 32 ++++++++++++++++----------------
fs/nfs/objlayout/objlayout.c | 36 +++++++++++++++++++-----------------
fs/nfs/objlayout/objlayout.h | 4 ++--
3 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index d0cda12..0c7c9ec 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -142,7 +142,7 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
}

struct objio_state;
-typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
+typedef int (*objio_done_fn)(struct objio_state *ios);

struct objio_state {
/* Generic layer */
@@ -720,7 +720,7 @@ out:
return 0;
}

-static ssize_t _sync_done(struct objio_state *ios)
+static int _sync_done(struct objio_state *ios)
{
struct completion *waiting = ios->private;

@@ -742,10 +742,10 @@ static void _done_io(struct osd_request *or, void *p)
kref_put(&ios->kref, _last_io);
}

-static ssize_t _io_exec(struct objio_state *ios)
+static int _io_exec(struct objio_state *ios)
{
DECLARE_COMPLETION_ONSTACK(wait);
- ssize_t status = 0; /* sync status */
+ int ret = 0;
unsigned i;
objio_done_fn saved_done_fn = ios->done;
bool sync = ios->ol_state.sync;
@@ -771,16 +771,16 @@ static ssize_t _io_exec(struct objio_state *ios)

if (sync) {
wait_for_completion(&wait);
- status = saved_done_fn(ios);
+ ret = saved_done_fn(ios);
}

- return status;
+ return ret;
}

/*
* read
*/
-static ssize_t _read_done(struct objio_state *ios)
+static int _read_done(struct objio_state *ios)
{
ssize_t status;
int ret = _io_check(ios, false);
@@ -793,7 +793,7 @@ static ssize_t _read_done(struct objio_state *ios)
status = ret;

objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
- return status;
+ return ret;
}

static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
@@ -833,7 +833,7 @@ err:
return ret;
}

-static ssize_t _read_exec(struct objio_state *ios)
+static int _read_exec(struct objio_state *ios)
{
unsigned i;
int ret;
@@ -847,14 +847,14 @@ static ssize_t _read_exec(struct objio_state *ios)
}

ios->done = _read_done;
- return _io_exec(ios); /* In sync mode exec returns the io status */
+ return _io_exec(ios);

err:
_io_free(ios);
return ret;
}

-ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
+int objio_read_pagelist(struct objlayout_io_state *ol_state)
{
struct objio_state *ios = container_of(ol_state, struct objio_state,
ol_state);
@@ -870,7 +870,7 @@ ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
/*
* write
*/
-static ssize_t _write_done(struct objio_state *ios)
+static int _write_done(struct objio_state *ios)
{
ssize_t status;
int ret = _io_check(ios, true);
@@ -887,7 +887,7 @@ static ssize_t _write_done(struct objio_state *ios)
}

objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
- return status;
+ return ret;
}

static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
@@ -955,7 +955,7 @@ err:
return ret;
}

-static ssize_t _write_exec(struct objio_state *ios)
+static int _write_exec(struct objio_state *ios)
{
unsigned i;
int ret;
@@ -969,14 +969,14 @@ static ssize_t _write_exec(struct objio_state *ios)
}

ios->done = _write_done;
- return _io_exec(ios); /* In sync mode exec returns the io->status */
+ return _io_exec(ios);

err:
_io_free(ios);
return ret;
}

-ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
+int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
{
struct objio_state *ios = container_of(ol_state, struct objio_state,
ol_state);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1300736..99c807d 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -315,16 +315,13 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
loff_t offset = rdata->args.offset;
size_t count = rdata->args.count;
struct objlayout_io_state *state;
- ssize_t status = 0;
+ int err;
loff_t eof;

- dprintk("%s: Begin inode %p offset %llu count %d\n",
- __func__, rdata->inode, offset, (int)count);
-
eof = i_size_read(rdata->inode);
if (unlikely(offset + count > eof)) {
if (offset >= eof) {
- status = 0;
+ err = 0;
rdata->res.count = 0;
rdata->res.eof = 1;
/*FIXME: do we need to call pnfs_ld_read_done() */
@@ -341,14 +338,19 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
rdata->lseg, rdata,
GFP_KERNEL);
if (unlikely(!state)) {
- status = -ENOMEM;
+ err = -ENOMEM;
goto out;
}
+ dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
+ __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);

- status = objio_read_pagelist(state);
+ err = objio_read_pagelist(state);
out:
- dprintk("%s: Return status %Zd\n", __func__, status);
- rdata->pnfs_error = status;
+ if (unlikely(err)) {
+ rdata->pnfs_error = err;
+ dprintk("%s: Returned Error %d\n", __func__, err);
+ return PNFS_NOT_ATTEMPTED;
+ }
return PNFS_ATTEMPTED;
}

@@ -406,10 +408,7 @@ objlayout_write_pagelist(struct nfs_write_data *wdata,
int how)
{
struct objlayout_io_state *state;
- ssize_t status;
-
- dprintk("%s: Begin inode %p offset %llu count %u\n",
- __func__, wdata->inode, wdata->args.offset, wdata->args.count);
+ int err;

state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
wdata->args.pages,
@@ -419,16 +418,19 @@ objlayout_write_pagelist(struct nfs_write_data *wdata,
wdata->lseg, wdata,
GFP_NOFS);
if (unlikely(!state)) {
- status = -ENOMEM;
+ err = -ENOMEM;
goto out;
}

state->sync = how & FLUSH_SYNC;

- status = objio_write_pagelist(state, how & FLUSH_STABLE);
+ err = objio_write_pagelist(state, how & FLUSH_STABLE);
out:
- dprintk("%s: Return status %Zd\n", __func__, status);
- wdata->pnfs_error = status;
+ if (unlikely(err)) {
+ wdata->pnfs_error = err;
+ dprintk("%s: Returned Error %d\n", __func__, err);
+ return PNFS_NOT_ATTEMPTED;
+ }
return PNFS_ATTEMPTED;
}

diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index ffb884c..4edac9b 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -115,8 +115,8 @@ extern int objio_alloc_io_state(
gfp_t gfp_flags);
extern void objio_free_io_state(struct objlayout_io_state *state);

-extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
-extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
+extern int objio_read_pagelist(struct objlayout_io_state *ol_state);
+extern int objio_write_pagelist(struct objlayout_io_state *ol_state,
bool stable);

/*
--
1.7.2.3


2011-10-04 10:32:10

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 08/19] exofs: Support for short read/writes

If at read/write_done the actual IO was shorter than requested,
as reported in the returned ios->length, it is not an error. The remainder
of the pages should just be unlocked but not marked uptodate or
end_page_writeback. They will be re-issued later by the VFS.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/inode.c | 35 ++++++++++++++++++++++++++---------
1 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 14e408b..0afe761 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -149,14 +149,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page,
return 0;
}

+enum {PAGE_WAS_NOT_IN_IO = 17};
static int update_read_page(struct page *page, int ret)
{
- if (ret == 0) {
+ switch (ret) {
+ case 0:
/* Everything is OK */
SetPageUptodate(page);
if (PageError(page))
ClearPageError(page);
- } else if (ret == -EFAULT) {
+ break;
+ case -EFAULT:
/* In this case we were trying to read something that wasn't on
* disk yet - return a page full of zeroes. This should be OK,
* because the object should be empty (if there was a write
@@ -167,16 +170,22 @@ static int update_read_page(struct page *page, int ret)
SetPageUptodate(page);
if (PageError(page))
ClearPageError(page);
- ret = 0; /* recovered error */
EXOFS_DBGMSG("recovered read error\n");
- } else /* Error */
+ /* fall through */
+ case PAGE_WAS_NOT_IN_IO:
+ ret = 0; /* recovered error */
+ break;
+ default:
SetPageError(page);
-
+ }
return ret;
}

static void update_write_page(struct page *page, int ret)
{
+ if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
+ return; /* don't pass start don't collect $200 */
+
if (ret) {
mapping_set_error(page->mapping, ret);
SetPageError(page);
@@ -195,10 +204,14 @@ static int __readpages_done(struct page_collect *pcol)
u64 length = 0;
int ret = ore_check_io(pcol->ios, &resid);

- if (likely(!ret))
+ if (likely(!ret)) {
good_bytes = pcol->length;
- else
+ ret = PAGE_WAS_NOT_IN_IO;
+ } else {
good_bytes = pcol->length - resid;
+ }
+ if (good_bytes > pcol->ios->length)
+ good_bytes = pcol->ios->length;

EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
" length=0x%lx nr_pages=%u\n",
@@ -518,10 +531,14 @@ static void writepages_done(struct ore_io_state *ios, void *p)

atomic_dec(&pcol->sbi->s_curr_pending);

- if (likely(!ret))
+ if (likely(!ret)) {
good_bytes = pcol->length;
- else
+ ret = PAGE_WAS_NOT_IN_IO;
+ } else {
good_bytes = pcol->length - resid;
+ }
+ if (good_bytes > pcol->ios->length)
+ good_bytes = pcol->ios->length;

EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
" length=0x%lx nr_pages=%u\n",
--
1.7.2.3


2011-10-04 12:24:25

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCHSET 00/19] objlayout: Move to ORE

On 10/04/2011 02:04 PM, Benny Halevy wrote:
> Boaz, please change my email address in your contacts lists
> to [email protected], thanks...
>

Rrrr really sorry. I did change it everywhere except in that
my-send-patches script. Changed now.

Sorry
Boaz

2011-10-04 10:31:50

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 07/19] ore: Remove check for ios->kern_buff in _prepare_for_striping to later

Move the check and preparation of the ios->kern_buff case to
later inside _write_mirror().

Since read was never used with ios->kern_buff its support is removed
instead of fixed.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/ore.c | 36 +++++++++++++-----------------------
1 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 43a53ba..2f39f23 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -428,17 +428,6 @@ static int _prepare_for_striping(struct ore_io_state *ios)
int ret = 0;

if (!ios->pages) {
- if (ios->kern_buff) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[0];
-
- per_dev->offset = si->obj_offset;
- per_dev->dev = si->dev;
-
- /* no cross device without page array */
- BUG_ON((ios->layout->group_width > 1) &&
- (si->unit_off + ios->length >
- ios->layout->stripe_unit));
- }
ios->numdevs = ios->layout->mirrors_p1;
return 0;
}
@@ -561,7 +550,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
goto out;
}
per_dev->or = or;
- per_dev->offset = master_dev->offset;

if (ios->pages) {
struct bio *bio;
@@ -580,6 +568,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
__bio_clone(bio, master_dev->bio);
bio->bi_bdev = NULL;
bio->bi_next = NULL;
+ per_dev->offset = master_dev->offset;
per_dev->length = master_dev->length;
per_dev->bio = bio;
per_dev->dev = dev;
@@ -597,7 +586,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
_LLU(per_dev->offset),
_LLU(per_dev->length), dev);
} else if (ios->kern_buff) {
- ret = osd_req_write_kern(or, _ios_obj(ios, dev),
+ per_dev->offset = ios->si.obj_offset;
+ per_dev->dev = ios->si.dev + dev;
+
+ /* no cross device without page array */
+ BUG_ON((ios->layout->group_width > 1) &&
+ (ios->si.unit_off + ios->length >
+ ios->layout->stripe_unit));
+
+ ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
per_dev->offset,
ios->kern_buff, ios->length);
if (unlikely(ret))
@@ -606,7 +603,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
"length=0x%llx dev=%d\n",
_LLU(_ios_obj(ios, dev)->id),
_LLU(per_dev->offset),
- _LLU(ios->length), dev);
+ _LLU(ios->length), per_dev->dev);
} else {
osd_req_set_attributes(or, _ios_obj(ios, dev));
ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
@@ -672,16 +669,9 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
" dev=%d\n", _LLU(obj->id),
_LLU(per_dev->offset), _LLU(per_dev->length),
first_dev);
- } else if (ios->kern_buff) {
- int ret = osd_req_read_kern(or, obj, per_dev->offset,
- ios->kern_buff, ios->length);
- ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d ret=>%d\n",
- _LLU(obj->id), _LLU(per_dev->offset),
- _LLU(ios->length), first_dev, ret);
- if (unlikely(ret))
- return ret;
} else {
+ BUG_ON(ios->kern_buff);
+
osd_req_get_attributes(or, obj);
ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
_LLU(obj->id),
--
1.7.2.3


2011-10-04 12:28:29

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH 16/19] pnfs-obj: Rename objlayout_io_state => objlayout_io_res

On 10/04/2011 02:20 PM, Jim Rees wrote:
> Boaz Harrosh wrote:
>
> * All instances of objlayout_io_state => objlayout_io_res
> * All instances of state => oir;
> * All instances of ol_state => oir;
>
> Big but nothing to it
>
> Signed-off-by: Boaz Harrosh <[email protected]>
>
> You have trailing whitespace here:
>
> + * objlayout_{read,write}_done

Thanks! of course that was the very last change I did, to add that
comment. Will repost

Thanks
Boaz


2011-10-04 10:29:23

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 03/19] ore: Make ore_striping_info and ore_calc_stripe_info public

The struct ore_striping_info will be used later in other
structures, and ore_calc_stripe_info as well. Rename them and
make struct ore_striping_info public. ore_calc_stripe_info
is still static; it will be made public on first use.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/ore.c | 24 ++++++++----------------
include/scsi/osd_ore.h | 8 ++++++++
2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 870f85a..c2b0033 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -287,16 +287,8 @@ EXPORT_SYMBOL(ore_check_io);
*
* O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
*/
-struct _striping_info {
- u64 obj_offset;
- u64 group_length;
- u64 M; /* for truncate */
- unsigned dev;
- unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
- struct _striping_info *si)
+static void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+ struct ore_striping_info *si)
{
u32 stripe_unit = layout->stripe_unit;
u32 group_width = layout->group_width;
@@ -375,7 +367,7 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
}

static int _prepare_one_group(struct ore_io_state *ios, u64 length,
- struct _striping_info *si)
+ struct ore_striping_info *si)
{
unsigned stripe_unit = ios->layout->stripe_unit;
unsigned mirrors_p1 = ios->layout->mirrors_p1;
@@ -434,14 +426,14 @@ static int _prepare_for_striping(struct ore_io_state *ios)
{
u64 length = ios->length;
u64 offset = ios->offset;
- struct _striping_info si;
+ struct ore_striping_info si;
int ret = 0;

if (!ios->pages) {
if (ios->kern_buff) {
struct ore_per_dev_state *per_dev = &ios->per_dev[0];

- _calc_stripe_info(ios->layout, ios->offset, &si);
+ ore_calc_stripe_info(ios->layout, ios->offset, &si);
per_dev->offset = si.obj_offset;
per_dev->dev = si.dev;

@@ -455,7 +447,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
}

while (length) {
- _calc_stripe_info(ios->layout, offset, &si);
+ ore_calc_stripe_info(ios->layout, offset, &si);

if (length < si.group_length)
si.group_length = length;
@@ -744,7 +736,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
}

struct _trunc_info {
- struct _striping_info si;
+ struct ore_striping_info si;
u64 prev_group_obj_off;
u64 next_group_obj_off;

@@ -758,7 +750,7 @@ static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
{
unsigned stripe_unit = layout->stripe_unit;

- _calc_stripe_info(layout, file_offset, &ti->si);
+ ore_calc_stripe_info(layout, file_offset, &ti->si);

ti->prev_group_obj_off = ti->si.M * stripe_unit;
ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index f7fabb4..e4d550f 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -56,6 +56,14 @@ struct ore_components {
struct osd_dev **ods; /* osd_dev array */
};

+struct ore_striping_info {
+ u64 obj_offset;
+ u64 group_length;
+ u64 M; /* for truncate */
+ unsigned dev;
+ unsigned unit_off;
+};
+
struct ore_io_state;
typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private);

--
1.7.2.3


2011-10-04 10:29:57

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 04/19] ore/exofs: Change the type of the devices array (API change)

In the pNFS obj-LD the device table at the layout level needs
to point to a device_cache node, where it is possible and likely
that many layouts will point to the same device-nodes.

In Exofs we have a more orderly structure where we have a single
array of devices that repeats twice for a round-robin view of the
device table.

This patch moves to a model that can be used by the pNFS obj-LD
where struct ore_components holds an array of ore_dev-pointers.
(ore_dev is newly defined and contains a struct osd_dev *od
member)

Each pointer in the array of pointers will point to a bigger
user-defined dev_struct. That can be accessed by use of the
container_of macro.

In Exofs an __alloc_dev_table() function allocates the
ore_dev-pointers array as well as an exofs_dev array, in one
allocation and does the addresses dance to set everything pointing
correctly. It still keeps the double allocation trick for the
inodes round-robin view of the table.

The device table is always allocated dynamically, also for the
single device case. So it is unconditionally freed at umount.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/exofs.h | 10 +++-
fs/exofs/ore.c | 2 +-
fs/exofs/super.c | 99 +++++++++++++++++++++++++++++------------------
include/scsi/osd_ore.h | 26 ++++++++++++-
4 files changed, 94 insertions(+), 43 deletions(-)

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 3b2e047..006fd6f 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -53,6 +53,10 @@
/* u64 has problems with printk this will cast it to unsigned long long */
#define _LLU(x) (unsigned long long)(x)

+struct exofs_dev {
+ struct ore_dev ored;
+ unsigned did;
+};
/*
* our extension to the in-memory superblock
*/
@@ -69,7 +73,6 @@ struct exofs_sb_info {
struct ore_layout layout; /* Default files layout */
struct ore_comp one_comp; /* id & cred of partition id=0*/
struct ore_components oc; /* comps for the partition */
- struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
};

/*
@@ -214,13 +217,14 @@ static inline void exofs_init_comps(struct ore_components *oc,
one_comp->obj.id = oid;
exofs_make_credential(one_comp->cred, &one_comp->obj);

- oc->numdevs = sbi->oc.numdevs;
+ oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
+ sbi->layout.group_count;
oc->single_comp = EC_SINGLE_COMP;
oc->comps = one_comp;

/* Round robin device view of the table */
first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
- oc->ods = sbi->oc.ods + first_dev;
+ oc->ods = &sbi->oc.ods[first_dev];
}

#endif
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index c2b0033..a7d7925 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -59,7 +59,7 @@ static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)

static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
{
- return ios->oc->ods[index];
+ return ore_comp_dev(ios->oc, index);
}

int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 90b4c52..bce3686 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -431,17 +431,18 @@ static void _exofs_print_device(const char *msg, const char *dev_path,

static void exofs_free_sbi(struct exofs_sb_info *sbi)
{
- while (sbi->oc.numdevs) {
- int i = --sbi->oc.numdevs;
- struct osd_dev *od = sbi->oc.ods[i];
+ unsigned numdevs = sbi->oc.numdevs;
+
+ while (numdevs) {
+ unsigned i = --numdevs;
+ struct osd_dev *od = ore_comp_dev(&sbi->oc, i);

if (od) {
- sbi->oc.ods[i] = NULL;
+ ore_comp_set_dev(&sbi->oc, i, NULL);
osduld_put_device(od);
}
}
- if (sbi->oc.ods != sbi->_min_one_dev)
- kfree(sbi->oc.ods);
+ kfree(sbi->oc.ods);
kfree(sbi);
}

@@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb)
msecs_to_jiffies(100));
}

- _exofs_print_device("Unmounting", NULL, sbi->oc.ods[0],
+ _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
sbi->one_comp.obj.partition);

bdi_destroy(&sbi->bdi);
@@ -592,12 +593,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
return !(odi->systemid_len || odi->osdname_len);
}

+int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+ struct exofs_dev **peds)
+{
+ struct __alloc_ore_devs_and_exofs_devs {
+ /* Twice bigger table: See exofs_init_comps() and comment at
+ * exofs_read_lookup_dev_table()
+ */
+ struct ore_dev *oreds[numdevs * 2 - 1];
+ struct exofs_dev eds[numdevs];
+ } *aoded;
+ struct exofs_dev *eds;
+ unsigned i;
+
+ aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
+ if (unlikely(!aoded)) {
+ EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
+ numdevs);
+ return -ENOMEM;
+ }
+
+ sbi->oc.ods = aoded->oreds;
+ *peds = eds = aoded->eds;
+ for (i = 0; i < numdevs; ++i)
+ aoded->oreds[i] = &eds[i].ored;
+ return 0;
+}
+
static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
struct osd_dev *fscb_od,
unsigned table_count)
{
struct ore_comp comp;
struct exofs_device_table *dt;
+ struct exofs_dev *eds;
unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
sizeof(*dt);
unsigned numdevs, i;
@@ -634,20 +663,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
if (unlikely(ret))
goto out;

- if (likely(numdevs > 1)) {
- unsigned size = numdevs * sizeof(sbi->oc.ods[0]);
-
- /* Twice bigger table: See exofs_init_comps() and below
- * comment
- */
- sbi->oc.ods = kzalloc(size + size - 1, GFP_KERNEL);
- if (unlikely(!sbi->oc.ods)) {
- EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
- numdevs);
- ret = -ENOMEM;
- goto out;
- }
- }
+ ret = __alloc_dev_table(sbi, numdevs, &eds);
+ if (unlikely(ret))
+ goto out;
+ /* exofs round-robins the device table view according to inode
+ * number. We hold a: twice bigger table hence inodes can point
+ * to any device and have a sequential view of the table
+ * starting at this device. See exofs_init_comps()
+ */
+ memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
+ (numdevs - 1) * sizeof(sbi->oc.ods[0]));

for (i = 0; i < numdevs; i++) {
struct exofs_fscb fscb;
@@ -663,12 +688,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
i, odi.osdname);

+ /* the exofs id is currently the table index */
+ eds[i].did = i;
+
/* On all devices the device table is identical. The user can
* specify any one of the participating devices on the command
* line. We always keep them in device-table order.
*/
if (fscb_od && osduld_device_same(fscb_od, &odi)) {
- sbi->oc.ods[i] = fscb_od;
+ eds[i].ored.od = fscb_od;
++sbi->oc.numdevs;
fscb_od = NULL;
continue;
@@ -682,7 +710,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
goto out;
}

- sbi->oc.ods[i] = od;
+ eds[i].ored.od = od;
++sbi->oc.numdevs;

/* Read the fscb of the other devices to make sure the FS
@@ -705,21 +733,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,

out:
kfree(dt);
- if (likely(!ret)) {
- unsigned numdevs = sbi->oc.numdevs;
-
- if (unlikely(fscb_od)) {
+ if (unlikely(fscb_od && !ret)) {
EXOFS_ERR("ERROR: Bad device-table container device not present\n");
osduld_put_device(fscb_od);
return -EINVAL;
- }
- /* exofs round-robins the device table view according to inode
- * number. We hold a: twice bigger table hence inodes can point
- * to any device and have a sequential view of the table
- * starting at this device. See exofs_init_comps()
- */
- for (i = 0; i < numdevs - 1; ++i)
- sbi->oc.ods[i + numdevs] = sbi->oc.ods[i];
}
return ret;
}
@@ -773,7 +790,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
sbi->oc.numdevs = 1;
sbi->oc.single_comp = EC_SINGLE_COMP;
sbi->oc.comps = &sbi->one_comp;
- sbi->oc.ods = sbi->_min_one_dev;

/* fill in some other data by hand */
memset(sb->s_id, 0, sizeof(sb->s_id));
@@ -822,7 +838,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
if (unlikely(ret))
goto free_sbi;
} else {
- sbi->oc.ods[0] = od;
+ struct exofs_dev *eds;
+
+ ret = __alloc_dev_table(sbi, 1, &eds);
+ if (unlikely(ret))
+ goto free_sbi;
+
+ ore_comp_set_dev(&sbi->oc, 0, od);
}

__sbi_read_stats(sbi);
@@ -862,7 +884,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}

- _exofs_print_device("Mounting", opts->dev_name, sbi->oc.ods[0],
+ _exofs_print_device("Mounting", opts->dev_name,
+ ore_comp_dev(&sbi->oc, 0),
sbi->one_comp.obj.partition);
return 0;

diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index e4d550f..8fefdfb 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -44,6 +44,10 @@ struct ore_layout {
unsigned group_count;
};

+struct ore_dev {
+ struct osd_dev *od;
+};
+
struct ore_components {
unsigned numdevs; /* Num of devices in array */
/* If @single_comp == EC_SINGLE_COMP, @comps points to a single
@@ -53,9 +57,29 @@ struct ore_components {
EC_SINGLE_COMP = 0, EC_MULTPLE_COMPS = 0xffffffff
} single_comp;
struct ore_comp *comps;
- struct osd_dev **ods; /* osd_dev array */
+
+ /* Array of pointers to ore_dev-* . User will usually have these pointed
+ * too a bigger struct which contain an "ore_dev ored" member and use
+ * container_of(oc->ods[i], struct foo_dev, ored) to access the bigger
+ * structure.
+ */
+ struct ore_dev **ods;
};

+/* ore_comp_dev Recievies a logical device index */
+static inline struct osd_dev *ore_comp_dev(
+ const struct ore_components *oc, unsigned i)
+{
+ BUG_ON(oc->numdevs <= i);
+ return oc->ods[i]->od;
+}
+
+static inline void ore_comp_set_dev(
+ struct ore_components *oc, unsigned i, struct osd_dev *od)
+{
+ oc->ods[i]->od = od;
+}
+
struct ore_striping_info {
u64 obj_offset;
u64 group_length;
--
1.7.2.3


2011-10-04 10:33:29

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 10/19] ore: Support for partial component table

Users like the objlayout-driver would like to only pass
a partial device table that covers the IO in question.
For example exofs divides the file into raid-group-sized
chunks and only serves group_width number of devices at
a time.

The partiality is communicated by setting
ore_components->first_dev; the array then covers all logical
devices from oc->first_dev up to, but not including,
(oc->first_dev + oc->numdevs).

The ore_comp_dev() API receives a logical device index
and returns the actual present device in the table.
An out-of-range dev_index will BUG.

Logical device index is the theoretical device index as if
all the devices of a file are present, i.e.:
total_devs = group_width * mirrors_p1 * group_count
0 <= dev_index < total_devs

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/exofs.h | 1 +
fs/exofs/ore.c | 4 ++++
include/scsi/osd_ore.h | 7 ++++---
3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 006fd6f..51f4b4c 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -217,6 +217,7 @@ static inline void exofs_init_comps(struct ore_components *oc,
one_comp->obj.id = oid;
exofs_make_credential(one_comp->cred, &one_comp->obj);

+ oc->first_dev = 0;
oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
sbi->layout.group_count;
oc->single_comp = EC_SINGLE_COMP;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 0b992e1..7913168 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -62,6 +62,10 @@ static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)

static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
{
+ ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
+ ios->oc->first_dev, ios->oc->numdevs, index,
+ ios->oc->ods);
+
return ore_comp_dev(ios->oc, index);
}

diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index baeef02..492b70d 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -49,6 +49,7 @@ struct ore_dev {
};

struct ore_components {
+ unsigned first_dev; /* First logical device no */
unsigned numdevs; /* Num of devices in array */
/* If @single_comp == EC_SINGLE_COMP, @comps points to a single
* component. else there are @numdevs components
@@ -70,14 +71,14 @@ struct ore_components {
static inline struct osd_dev *ore_comp_dev(
const struct ore_components *oc, unsigned i)
{
- BUG_ON(oc->numdevs <= i);
- return oc->ods[i]->od;
+ BUG_ON((i < oc->first_dev) || (oc->first_dev + oc->numdevs <= i));
+ return oc->ods[i - oc->first_dev]->od;
}

static inline void ore_comp_set_dev(
struct ore_components *oc, unsigned i, struct osd_dev *od)
{
- oc->ods[i]->od = od;
+ oc->ods[i - oc->first_dev]->od = od;
}

struct ore_striping_info {
--
1.7.2.3


2011-10-04 10:35:49

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 15/19] pnfs-obj: Get rid of objlayout_{alloc,free}_io_state

This is part of moving objio_osd to use the ORE.

objlayout_io_state had two functions:
1. It was used in the error reporting mechanism at layout_return.
This function is kept intact.
(Later patch will rename objlayout_io_state => objlayout_io_res)
2. Carrier of rw io members into the objio_{read,write}_pagelist API.
This is removed in this patch.

The {r,w}data received from NFS are passed directly to the
objio_{read,write}_pagelist API. The io_engine is now allocating
its own IO state as part of the read/write. The minimal
functionality that was part of the generic allocation is passed
to the io_engine.

So part of this patch is rename of:
ios->ol_state.foo => ios->foo

At objlayout_{read,write}_done an objlayout_io_state is passed that
denotes the result of the IO. (Hence the later name change).
If the IO is successful, objlayout calls the objio_free_result() API
immediately (which for objio_osd causes the release of the io_state).
If the IO ended in an error, it is held until reported in
layout_return and is released later through the objio_free_result()
API. (All this is not new, just renamed and cleaned.)

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 94 ++++++++++++++++++++++----------
fs/nfs/objlayout/objlayout.c | 124 +++++++++++-------------------------------
fs/nfs/objlayout/objlayout.h | 36 ++++++-------
3 files changed, 112 insertions(+), 142 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 0c7c9ec..48eb91a 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -148,6 +148,13 @@ struct objio_state {
/* Generic layer */
struct objlayout_io_state ol_state;

+ struct page **pages;
+ unsigned pgbase;
+ unsigned nr_pages;
+ unsigned long count;
+ loff_t offset;
+ bool sync;
+
struct objio_segment *layout;

struct kref kref;
@@ -394,30 +401,43 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
kfree(objio_seg);
}

-int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
- struct objlayout_io_state **outp,
- gfp_t gfp_flags)
+static int
+objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
+ struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
+ loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
+ struct objio_state **outp)
{
struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
struct objio_state *ios;
- const unsigned first_size = sizeof(*ios) +
- objio_seg->num_comps * sizeof(ios->per_dev[0]);
- const unsigned sec_size = objio_seg->num_comps *
- sizeof(ios->ol_state.ioerrs[0]);
-
- ios = kzalloc(first_size + sec_size, gfp_flags);
- if (unlikely(!ios))
+ struct __alloc_objio_state {
+ struct objio_state objios;
+ struct _objio_per_comp per_dev[objio_seg->num_comps];
+ struct pnfs_osd_ioerr ioerrs[objio_seg->num_comps];
+ } *aos;
+
+ aos = kzalloc(sizeof(*aos), gfp_flags);
+ if (unlikely(!aos))
return -ENOMEM;

- ios->layout = objio_seg;
- ios->ol_state.ioerrs = ((void *)ios) + first_size;
- ios->ol_state.num_comps = objio_seg->num_comps;
+ ios = &aos->objios;

- *outp = &ios->ol_state;
+ ios->layout = objio_seg;
+ objlayout_init_ioerrs(&aos->objios.ol_state, objio_seg->num_comps,
+ aos->ioerrs, rpcdata, pnfs_layout_type);
+
+ ios->pages = pages;
+ ios->pgbase = pgbase;
+ ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ ios->offset = offset;
+ ios->count = count;
+ ios->sync = 0;
+ BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
+
+ *outp = ios;
return 0;
}

-void objio_free_io_state(struct objlayout_io_state *ol_state)
+void objio_free_result(struct objlayout_io_state *ol_state)
{
struct objio_state *ios = container_of(ol_state, struct objio_state,
ol_state);
@@ -598,7 +618,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
if (per_dev->bio == NULL) {
unsigned pages_in_stripe = ios->layout->group_width *
(ios->layout->stripe_unit / PAGE_SIZE);
- unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
+ unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
ios->layout->group_width;

if (BIO_MAX_PAGES_KMALLOC < bio_size)
@@ -615,11 +635,11 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
unsigned added_len;

- BUG_ON(ios->ol_state.nr_pages <= pg);
+ BUG_ON(ios->nr_pages <= pg);
cur_len -= pglen;

added_len = bio_add_pc_page(q, per_dev->bio,
- ios->ol_state.pages[pg], pglen, pgbase);
+ ios->pages[pg], pglen, pgbase);
if (unlikely(pglen != added_len))
return -ENOMEM;
pgbase = 0;
@@ -660,7 +680,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
cur_len = stripe_unit - si->unit_off;
page_off = si->unit_off & ~PAGE_MASK;
BUG_ON(page_off &&
- (page_off != ios->ol_state.pgbase));
+ (page_off != ios->pgbase));
} else { /* dev > si->dev */
per_dev->offset = si->obj_offset - si->unit_off;
cur_len = stripe_unit;
@@ -693,8 +713,8 @@ out:

static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
{
- u64 length = ios->ol_state.count;
- u64 offset = ios->ol_state.offset;
+ u64 length = ios->count;
+ u64 offset = ios->offset;
struct _striping_info si;
unsigned last_pg = 0;
int ret = 0;
@@ -748,7 +768,7 @@ static int _io_exec(struct objio_state *ios)
int ret = 0;
unsigned i;
objio_done_fn saved_done_fn = ios->done;
- bool sync = ios->ol_state.sync;
+ bool sync = ios->sync;

if (sync) {
ios->done = _sync_done;
@@ -792,7 +812,7 @@ static int _read_done(struct objio_state *ios)
else
status = ret;

- objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
+ objlayout_read_done(&ios->ol_state, status, ios->sync);
return ret;
}

@@ -854,12 +874,18 @@ err:
return ret;
}

-int objio_read_pagelist(struct objlayout_io_state *ol_state)
+int objio_read_pagelist(struct nfs_read_data *rdata)
{
- struct objio_state *ios = container_of(ol_state, struct objio_state,
- ol_state);
+ struct objio_state *ios;
int ret;

+ ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout,
+ rdata->lseg, rdata->args.pages, rdata->args.pgbase,
+ rdata->args.offset, rdata->args.count, rdata,
+ GFP_KERNEL, &ios);
+ if (unlikely(ret))
+ return ret;
+
ret = _io_rw_pagelist(ios, GFP_KERNEL);
if (unlikely(ret))
return ret;
@@ -886,7 +912,7 @@ static int _write_done(struct objio_state *ios)
status = ret;
}

- objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
+ objlayout_write_done(&ios->ol_state, status, ios->sync);
return ret;
}

@@ -976,12 +1002,20 @@ err:
return ret;
}

-int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
+int objio_write_pagelist(struct nfs_write_data *wdata, int how)
{
- struct objio_state *ios = container_of(ol_state, struct objio_state,
- ol_state);
+ struct objio_state *ios;
int ret;

+ ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout,
+ wdata->lseg, wdata->args.pages, wdata->args.pgbase,
+ wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
+ &ios);
+ if (unlikely(ret))
+ return ret;
+
+ ios->sync = 0 != (how & FLUSH_SYNC);
+
/* TODO: ios->stable = stable; */
ret = _io_rw_pagelist(ios, GFP_NOFS);
if (unlikely(ret))
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 99c807d..a82053a 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -156,59 +156,23 @@ last_byte_offset(u64 start, u64 len)
return end > start ? end - 1 : NFS4_MAX_UINT64;
}

-static struct objlayout_io_state *
-objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
- struct page **pages,
- unsigned pgbase,
- loff_t offset,
- size_t count,
- struct pnfs_layout_segment *lseg,
- void *rpcdata,
- gfp_t gfp_flags)
+void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
+ struct page ***p_pages, unsigned *p_pgbase,
+ u64 offset, unsigned long count)
{
- struct objlayout_io_state *state;
u64 lseg_end_offset;

- dprintk("%s: allocating io_state\n", __func__);
- if (objio_alloc_io_state(lseg, &state, gfp_flags))
- return NULL;
-
BUG_ON(offset < lseg->pls_range.offset);
lseg_end_offset = end_offset(lseg->pls_range.offset,
lseg->pls_range.length);
BUG_ON(offset >= lseg_end_offset);
- if (offset + count > lseg_end_offset) {
- count = lseg->pls_range.length -
- (offset - lseg->pls_range.offset);
- dprintk("%s: truncated count %Zd\n", __func__, count);
- }
+ WARN_ON(offset + count > lseg_end_offset);

- if (pgbase > PAGE_SIZE) {
- pages += pgbase >> PAGE_SHIFT;
- pgbase &= ~PAGE_MASK;
+ if (*p_pgbase > PAGE_SIZE) {
+ dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
+ *p_pages += *p_pgbase >> PAGE_SHIFT;
+ *p_pgbase &= ~PAGE_MASK;
}
-
- INIT_LIST_HEAD(&state->err_list);
- state->lseg = lseg;
- state->rpcdata = rpcdata;
- state->pages = pages;
- state->pgbase = pgbase;
- state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
- state->offset = offset;
- state->count = count;
- state->sync = 0;
-
- return state;
-}
-
-static void
-objlayout_free_io_state(struct objlayout_io_state *state)
-{
- dprintk("%s: freeing io_state\n", __func__);
- if (unlikely(!state))
- return;
-
- objio_free_io_state(state);
}

/*
@@ -217,12 +181,10 @@ objlayout_free_io_state(struct objlayout_io_state *state)
static void
objlayout_iodone(struct objlayout_io_state *state)
{
- dprintk("%s: state %p status\n", __func__, state);
-
if (likely(state->status >= 0)) {
- objlayout_free_io_state(state);
+ objio_free_result(state);
} else {
- struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+ struct objlayout *objlay = state->objlay;

spin_lock(&objlay->lock);
objlay->delta_space_valid = OBJ_DSU_INVALID;
@@ -289,15 +251,15 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
{
struct nfs_read_data *rdata = state->rpcdata;

- state->status = status;
- dprintk("%s: Begin status=%zd eof=%d\n", __func__,
- status, rdata->res.eof);
- rdata->task.tk_status = status;
+ state->status = rdata->task.tk_status = status;
if (status >= 0)
rdata->res.count = status;
objlayout_iodone(state);
/* must not use state after this point */

+ dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
+ status, rdata->res.eof, sync);
+
if (sync)
pnfs_ld_read_done(rdata);
else {
@@ -314,7 +276,6 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
{
loff_t offset = rdata->args.offset;
size_t count = rdata->args.count;
- struct objlayout_io_state *state;
int err;
loff_t eof;

@@ -331,20 +292,14 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
}

rdata->res.eof = (offset + count) >= eof;
+ _fix_verify_io_params(rdata->lseg, &rdata->args.pages,
+ &rdata->args.pgbase,
+ rdata->args.offset, rdata->args.count);

- state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
- rdata->args.pages, rdata->args.pgbase,
- offset, count,
- rdata->lseg, rdata,
- GFP_KERNEL);
- if (unlikely(!state)) {
- err = -ENOMEM;
- goto out;
- }
dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
__func__, rdata->inode->i_ino, offset, count, rdata->res.eof);

- err = objio_read_pagelist(state);
+ err = objio_read_pagelist(rdata);
out:
if (unlikely(err)) {
rdata->pnfs_error = err;
@@ -374,23 +329,18 @@ void
objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
bool sync)
{
- struct nfs_write_data *wdata;
+ struct nfs_write_data *wdata = state->rpcdata;

- dprintk("%s: Begin\n", __func__);
- wdata = state->rpcdata;
- state->status = status;
- wdata->task.tk_status = status;
+ state->status = wdata->task.tk_status = status;
if (status >= 0) {
wdata->res.count = status;
wdata->verf.committed = state->committed;
- dprintk("%s: Return status %d committed %d\n",
- __func__, wdata->task.tk_status,
- wdata->verf.committed);
- } else
- dprintk("%s: Return status %d\n",
- __func__, wdata->task.tk_status);
+ }
objlayout_iodone(state);
- /* must not use state after this point */
+ /* must not use oir after this point */
+
+ dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
+ status, wdata->verf.committed, sync);

if (sync)
pnfs_ld_write_done(wdata);
@@ -407,25 +357,13 @@ enum pnfs_try_status
objlayout_write_pagelist(struct nfs_write_data *wdata,
int how)
{
- struct objlayout_io_state *state;
int err;

- state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
- wdata->args.pages,
- wdata->args.pgbase,
- wdata->args.offset,
- wdata->args.count,
- wdata->lseg, wdata,
- GFP_NOFS);
- if (unlikely(!state)) {
- err = -ENOMEM;
- goto out;
- }
+ _fix_verify_io_params(wdata->lseg, &wdata->args.pages,
+ &wdata->args.pgbase,
+ wdata->args.offset, wdata->args.count);

- state->sync = how & FLUSH_SYNC;
-
- err = objio_write_pagelist(state, how & FLUSH_STABLE);
- out:
+ err = objio_write_pagelist(wdata, how);
if (unlikely(err)) {
wdata->pnfs_error = err;
dprintk("%s: Returned Error %d\n", __func__, err);
@@ -564,7 +502,7 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
merge_ioerr(&accumulated_err, ioerr);
}
list_del(&state->err_list);
- objlayout_free_io_state(state);
+ objio_free_result(state);
}

pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
@@ -632,7 +570,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
goto loop_done;
}
list_del(&state->err_list);
- objlayout_free_io_state(state);
+ objio_free_result(state);
}
loop_done:
spin_unlock(&objlay->lock);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 4edac9b..d7b2ccfa 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -75,14 +75,7 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
* embedded in objects provider io_state data structure
*/
struct objlayout_io_state {
- struct pnfs_layout_segment *lseg;
-
- struct page **pages;
- unsigned pgbase;
- unsigned nr_pages;
- unsigned long count;
- loff_t offset;
- bool sync;
+ struct objlayout *objlay;

void *rpcdata;
int status; /* res */
@@ -99,6 +92,18 @@ struct objlayout_io_state {
struct pnfs_osd_ioerr *ioerrs;
};

+static inline
+void objlayout_init_ioerrs(struct objlayout_io_state *oir, unsigned num_comps,
+ struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
+ struct pnfs_layout_hdr *pnfs_layout_type)
+{
+ oir->objlay = OBJLAYOUT(pnfs_layout_type);
+ oir->rpcdata = rpcdata;
+ INIT_LIST_HEAD(&oir->err_list);
+ oir->num_comps = num_comps;
+ oir->ioerrs = ioerrs;
+}
+
/*
* Raid engine I/O API
*/
@@ -109,15 +114,10 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
gfp_t gfp_flags);
extern void objio_free_lseg(struct pnfs_layout_segment *lseg);

-extern int objio_alloc_io_state(
- struct pnfs_layout_segment *lseg,
- struct objlayout_io_state **outp,
- gfp_t gfp_flags);
-extern void objio_free_io_state(struct objlayout_io_state *state);
+extern void objio_free_result(struct objlayout_io_state *state);

-extern int objio_read_pagelist(struct objlayout_io_state *ol_state);
-extern int objio_write_pagelist(struct objlayout_io_state *ol_state,
- bool stable);
+extern int objio_read_pagelist(struct nfs_read_data *rdata);
+extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);

/*
* callback API
@@ -127,10 +127,8 @@ extern void objlayout_io_set_result(struct objlayout_io_state *state,
int osd_error, u64 offset, u64 length, bool is_write);

static inline void
-objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
{
- struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
-
/* If one of the I/Os errored out and the delta_space_used was
* invalid we render the complete report as invalid. Protocol mandate
* the DSU be accurate or not reported.
--
1.7.2.3


2011-10-04 10:37:01

by Boaz Harrosh

[permalink] [raw]
Subject: [PATCH 18/19] pnfs-obj: move to ore 02: move to ORE

In this patch we are actually moving to the ORE.
(Object Raid Engine).

objio_state holds a pointer to an ore_io_state. Once
we have an ore_io_state at hand we can call the ore
for reading/writing. We register on the done path
to kick off the nfs io_done mechanism.

Again, for ease of reviewing, the old code is wrapped in "#if 0"
but is not removed, so the diff command works better.
The old code will be removed in the next patch.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 133 +++++++++++++++++++-----------------------
1 files changed, 59 insertions(+), 74 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index bd7ec26..00b3849 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -44,12 +44,6 @@

#define NFSDBG_FACILITY NFSDBG_PNFS_LD

-#define _LLU(x) ((unsigned long long)x)
-
-enum { BIO_MAX_PAGES_KMALLOC =
- (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
-};
-
struct objio_dev_ent {
struct nfs4_deviceid_node id_node;
struct ore_dev od;
@@ -124,37 +118,13 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
return container_of(lseg, struct objio_segment, lseg);
}

-struct objio_state;
-typedef int (*objio_done_fn)(struct objio_state *ios);
-
struct objio_state {
/* Generic layer */
struct objlayout_io_res oir;

- struct page **pages;
- unsigned pgbase;
- unsigned nr_pages;
- unsigned long count;
- loff_t offset;
bool sync;
-
- struct ore_layout *layout;
- struct ore_components *oc;
-
- struct kref kref;
- objio_done_fn done;
- void *private;
-
- unsigned long length;
- unsigned numdevs; /* Actually used devs in this IO */
- /* A per-device variable array of size numdevs */
- struct _objio_per_comp {
- struct bio *bio;
- struct osd_request *or;
- unsigned long length;
- u64 offset;
- unsigned dev;
- } per_dev[];
+ /*FIXME: Support for extra_bytes at ore_get_rw_state() */
+ struct ore_io_state *ios;
};

/* Send and wait for a get_device_info of devices in the layout,
@@ -374,16 +344,16 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
}

static int
-objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
+objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
struct objio_state **outp)
{
struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
- struct objio_state *ios;
+ struct ore_io_state *ios;
+ int ret;
struct __alloc_objio_state {
struct objio_state objios;
- struct _objio_per_comp per_dev[objio_seg->oc.numdevs];
struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
} *aos;

@@ -391,30 +361,33 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
if (unlikely(!aos))
return -ENOMEM;

- ios = &aos->objios;
-
- ios->layout = &objio_seg->layout;
- ios->oc = &objio_seg->oc;
objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
aos->ioerrs, rpcdata, pnfs_layout_type);

+ ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
+ offset, count, &ios);
+ if (unlikely(ret)) {
+ kfree(aos);
+ return ret;
+ }
+
ios->pages = pages;
ios->pgbase = pgbase;
- ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
- ios->offset = offset;
- ios->count = count;
- ios->sync = 0;
+ ios->private = aos;
BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);

- *outp = ios;
+ aos->objios.sync = 0;
+ aos->objios.ios = ios;
+ *outp = &aos->objios;
return 0;
}

void objio_free_result(struct objlayout_io_res *oir)
{
- struct objio_state *ios = container_of(oir, struct objio_state, oir);
+ struct objio_state *objios = container_of(oir, struct objio_state, oir);

- kfree(ios);
+ ore_put_io_state(objios->ios);
+ kfree(objios);
}

enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
@@ -447,7 +420,7 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
}
}

-static void __on_dev_error(struct objio_state *ios, bool is_write,
+static void __on_dev_error(struct ore_io_state *ios,
struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
u64 dev_offset, u64 dev_len)
{
@@ -465,9 +438,10 @@ static void __on_dev_error(struct objio_state *ios, bool is_write,

objlayout_io_set_result(&objios->oir, comp,
&pooid, osd_pri_2_pnfs_err(oep),
- dev_offset, dev_len, is_write);
+ dev_offset, dev_len, !ios->reading);
}

+#if 0
static void _clear_bio(struct bio *bio)
{
struct bio_vec *bv;
@@ -786,26 +760,28 @@ static int _io_exec(struct objio_state *ios)

return ret;
}
+#endif

/*
* read
*/
-static int _read_done(struct objio_state *ios)
+static void _read_done(struct ore_io_state *ios, void *private)
{
+ struct objio_state *objios = private;
ssize_t status;
- int ret = _io_check(ios, false);
+ int ret = ore_check_io(ios, &__on_dev_error);

- _io_free(ios);
+ /* FIXME: _io_free(ios) can we dealocate the libosd resources; */

if (likely(!ret))
status = ios->length;
else
status = ret;

- objlayout_read_done(&ios->oir, status, ios->sync);
- return ret;
+ objlayout_read_done(&objios->oir, status, objios->sync);
}

+#if 0
static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
{
struct osd_request *or = NULL;
@@ -860,49 +836,50 @@ err:
_io_free(ios);
return ret;
}
+#endif

int objio_read_pagelist(struct nfs_read_data *rdata)
{
- struct objio_state *ios;
+ struct objio_state *objios;
int ret;

- ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout,
+ ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
rdata->lseg, rdata->args.pages, rdata->args.pgbase,
rdata->args.offset, rdata->args.count, rdata,
- GFP_KERNEL, &ios);
- if (unlikely(ret))
- return ret;
-
- ret = _io_rw_pagelist(ios, GFP_KERNEL);
+ GFP_KERNEL, &objios);
if (unlikely(ret))
return ret;

- return _read_exec(ios);
+ objios->ios->done = _read_done;
+ dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
+ rdata->args.offset, rdata->args.count);
+ return ore_read(objios->ios);
}

/*
* write
*/
-static int _write_done(struct objio_state *ios)
+static void _write_done(struct ore_io_state *ios, void *private)
{
+ struct objio_state *objios = private;
ssize_t status;
- int ret = _io_check(ios, true);
+ int ret = ore_check_io(ios, &__on_dev_error);

- _io_free(ios);
+ /* FIXME: _io_free(ios) can we dealocate the libosd resources; */

if (likely(!ret)) {
/* FIXME: should be based on the OSD's persistence model
* See OSD2r05 Section 4.13 Data persistence model */
- ios->oir.committed = NFS_FILE_SYNC;
+ objios->oir.committed = NFS_FILE_SYNC;
status = ios->length;
} else {
status = ret;
}

- objlayout_write_done(&ios->oir, status, ios->sync);
- return ret;
+ objlayout_write_done(&objios->oir, status, objios->sync);
}

+#if 0
static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
{
struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
@@ -984,27 +961,35 @@ err:
_io_free(ios);
return ret;
}
+#endif

int objio_write_pagelist(struct nfs_write_data *wdata, int how)
{
- struct objio_state *ios;
+ struct objio_state *objios;
int ret;

- ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout,
+ ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
wdata->lseg, wdata->args.pages, wdata->args.pgbase,
wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
- &ios);
+ &objios);
if (unlikely(ret))
return ret;

- ios->sync = 0 != (how & FLUSH_SYNC);
+ objios->sync = 0 != (how & FLUSH_SYNC);

- /* TODO: ios->stable = stable; */
- ret = _io_rw_pagelist(ios, GFP_NOFS);
+ if (!objios->sync)
+ objios->ios->done = _write_done;
+
+ dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
+ wdata->args.offset, wdata->args.count);
+ ret = ore_write(objios->ios);
if (unlikely(ret))
return ret;

- return _write_exec(ios);
+ if (objios->sync)
+ _write_done(objios->ios, objios);
+
+ return 0;
}

static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
--
1.7.2.3