2006-03-08 19:45:20

by Dan Aloni

[permalink] [raw]
Subject: [PATCH] sata_mv: stabilize for 5081 and other fixes

Hello,

With the patch below I've managed to stabilize the sata_mv driver
running a Marvell 5081 SATA controller. Prior to these changes it
would cause sporadic memory corruptions right after insmod. I prefer
this driver over Marvell's own driver which have all sorts of weird
bugs.

The patch also increases the maximum possible number of SG entries
per IO to 256 (this large sg_tablesize is a requirement for the
application we are developing at my company, XIV). Notice that it
also zeros-out a reserved field in the SG, I'm not sure how it
affected stability but something did. I've also added the 'volatile'
qualifier to some fields that belong to structs involved with I/O.

This patch is for 2.6.16-rc5.

Signed-off-by: Dan Aloni <[email protected]>

--- drivers/scsi/sata_mv.c 2006-03-08 11:30:10.000000000 +0200
+++ drivers/scsi/sata_mv.c 2006-03-08 20:59:55.000000000 +0200
@@ -72,9 +72,10 @@
*/
MV_CRQB_Q_SZ = (32 * MV_MAX_Q_DEPTH),
MV_CRPB_Q_SZ = (8 * MV_MAX_Q_DEPTH),
- MV_MAX_SG_CT = 176,
+ MV_MAX_SG_CT = 256,
MV_SG_TBL_SZ = (16 * MV_MAX_SG_CT),
- MV_PORT_PRIV_DMA_SZ = (MV_CRQB_Q_SZ + MV_CRPB_Q_SZ + MV_SG_TBL_SZ),
+ MV_PORT_PRIV_DMA1_SZ = (MV_CRQB_Q_SZ + MV_CRPB_Q_SZ),
+ MV_PORT_PRIV_DMA2_SZ = (MV_SG_TBL_SZ),

MV_PORTS_PER_HC = 4,
/* == (port / MV_PORTS_PER_HC) to determine HC from 0-7 port */
@@ -98,7 +99,7 @@

CRPB_FLAG_STATUS_SHIFT = 8,

- EPRD_FLAG_END_OF_TBL = (1 << 31),
+ EPRD_FLAG_END_OF_TBL = (1 << 15),

/* PCI interface registers */

@@ -257,29 +258,34 @@
chip_608x,
};

+#pragma pack(1)
+
/* Command ReQuest Block: 32B */
struct mv_crqb {
- u32 sg_addr;
- u32 sg_addr_hi;
- u16 ctrl_flags;
+ volatile u32 sg_addr;
+ volatile u32 sg_addr_hi;
+ volatile u16 ctrl_flags;
u16 ata_cmd[11];
};

/* Command ResPonse Block: 8B */
struct mv_crpb {
- u16 id;
- u16 flags;
- u32 tmstmp;
+ volatile u16 id;
+ volatile u16 flags;
+ volatile u32 tmstmp;
};

/* EDMA Physical Region Descriptor (ePRD); A.K.A. SG */
struct mv_sg {
- u32 addr;
- u32 flags_size;
- u32 addr_hi;
- u32 reserved;
+ volatile u32 addr;
+ volatile u16 size;
+ volatile u16 flags;
+ volatile u32 addr_hi;
+ volatile u32 reserved;
};

+#pragma pack(0)
+
struct mv_port_priv {
struct mv_crqb *crqb;
dma_addr_t crqb_dma;
@@ -294,8 +300,8 @@
};

struct mv_port_signal {
- u32 amps;
- u32 pre;
+ volatile u32 amps;
+ volatile u32 pre;
};

struct mv_host_priv;
@@ -365,7 +371,7 @@
.eh_strategy_handler = ata_scsi_error,
.can_queue = MV_USE_Q_DEPTH,
.this_id = ATA_SHT_THIS_ID,
- .sg_tablesize = MV_MAX_SG_CT / 2,
+ .sg_tablesize = MV_MAX_SG_CT,
.max_sectors = ATA_MAX_SECTORS,
.cmd_per_lun = ATA_SHT_CMD_PER_LUN,
.emulated = ATA_SHT_EMULATED,
@@ -509,10 +515,7 @@
.reset_bus = mv_reset_pci_bus,
};

-/*
- * module options
- */
-static int msi; /* Use PCI msi; either zero (off, default) or non-zero */
+static int msi; /* Use PCI msi; either zero (off, default) or non-zero */


/*
@@ -770,7 +773,8 @@

static inline void mv_priv_free(struct mv_port_priv *pp, struct device *dev)
{
- dma_free_coherent(dev, MV_PORT_PRIV_DMA_SZ, pp->crpb, pp->crpb_dma);
+ dma_free_coherent(dev, MV_PORT_PRIV_DMA1_SZ, pp->crpb, pp->crpb_dma);
+ dma_free_coherent(dev, MV_PORT_PRIV_DMA2_SZ, pp->sg_tbl, pp->sg_tbl_dma);
}

/**
@@ -788,8 +792,8 @@
struct device *dev = ap->host_set->dev;
struct mv_port_priv *pp;
void __iomem *port_mmio = mv_ap_base(ap);
- void *mem;
- dma_addr_t mem_dma;
+ void *mem, *mem2;
+ dma_addr_t mem_dma, mem_dma2;
int rc = -ENOMEM;

pp = kmalloc(sizeof(*pp), GFP_KERNEL);
@@ -797,11 +801,17 @@
goto err_out;
memset(pp, 0, sizeof(*pp));

- mem = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA_SZ, &mem_dma,
+ mem = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA1_SZ, &mem_dma,
GFP_KERNEL);
if (!mem)
goto err_out_pp;
- memset(mem, 0, MV_PORT_PRIV_DMA_SZ);
+ memset(mem, 0, MV_PORT_PRIV_DMA1_SZ);
+
+ mem2 = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA2_SZ, &mem_dma2,
+ GFP_KERNEL);
+ if (!mem2)
+ goto err_out_pp_2;
+ memset(mem2, 0, MV_PORT_PRIV_DMA2_SZ);

rc = ata_pad_alloc(ap, dev);
if (rc)
@@ -820,14 +830,12 @@
*/
pp->crpb = mem;
pp->crpb_dma = mem_dma;
- mem += MV_CRPB_Q_SZ;
- mem_dma += MV_CRPB_Q_SZ;

/* Third item:
* Table of scatter-gather descriptors (ePRD), 16 bytes each
*/
- pp->sg_tbl = mem;
- pp->sg_tbl_dma = mem_dma;
+ pp->sg_tbl = mem2;
+ pp->sg_tbl_dma = mem_dma2;

writelfl(EDMA_CFG_Q_DEPTH | EDMA_CFG_RD_BRST_EXT |
EDMA_CFG_WR_BUFF_LEN, port_mmio + EDMA_CFG_OFS);
@@ -853,7 +861,9 @@
return 0;

err_out_priv:
- mv_priv_free(pp, dev);
+ dma_free_coherent(dev, MV_PORT_PRIV_DMA2_SZ, pp->sg_tbl, pp->sg_tbl_dma);
+err_out_pp_2:
+ dma_free_coherent(dev, MV_PORT_PRIV_DMA1_SZ, pp->crpb, pp->crpb_dma);
err_out_pp:
kfree(pp);
err_out:
@@ -915,13 +925,15 @@

pp->sg_tbl[i].addr = cpu_to_le32(addr & 0xffffffff);
pp->sg_tbl[i].addr_hi = cpu_to_le32((addr >> 16) >> 16);
- pp->sg_tbl[i].flags_size = cpu_to_le32(len);
+ pp->sg_tbl[i].flags = cpu_to_le16(0);
+ pp->sg_tbl[i].size = cpu_to_le16(len);
+ pp->sg_tbl[i].reserved = 0;

sg_len -= len;
addr += len;

if (!sg_len && ata_sg_is_last(sg, qc))
- pp->sg_tbl[i].flags_size |= cpu_to_le32(EPRD_FLAG_END_OF_TBL);
+ pp->sg_tbl[i].flags |= cpu_to_le16(EPRD_FLAG_END_OF_TBL);

i++;
}


--
Dan Aloni
[email protected], [email protected], [email protected], [email protected]


2006-03-08 20:27:24

by John Stoffel

[permalink] [raw]
Subject: Re: [PATCH] sata_mv: stabilize for 5081 and other fixes

>>>>> "Dan" == Dan Aloni <[email protected]> writes:

Dan> With the patch below I've managed to stabilize the sata_mv driver
Dan> running a Marvell 5081 SATA controller. Prior to these changes it
Dan> would cause sporadic memory corruptions right after insmod. I prefer
Dan> this driver over Marvell's own driver which have all sorts of weird
Dan> bugs.

Cool! Now I can think about SATA more seriously. Now for some
comments on the patch itself, from a non-kernel hacker, so feel free
to blow me off.

Dan> --- drivers/scsi/sata_mv.c 2006-03-08 11:30:10.000000000 +0200
Dan> +++ drivers/scsi/sata_mv.c 2006-03-08 20:59:55.000000000 +0200
Dan> @@ -72,9 +72,10 @@
Dan> */
Dan> MV_CRQB_Q_SZ = (32 * MV_MAX_Q_DEPTH),
Dan> MV_CRPB_Q_SZ = (8 * MV_MAX_Q_DEPTH),
Dan> - MV_MAX_SG_CT = 176,
Dan> + MV_MAX_SG_CT = 256,
Dan> MV_SG_TBL_SZ = (16 * MV_MAX_SG_CT),
Dan> - MV_PORT_PRIV_DMA_SZ = (MV_CRQB_Q_SZ + MV_CRPB_Q_SZ + MV_SG_TBL_SZ),
Dan> + MV_PORT_PRIV_DMA1_SZ = (MV_CRQB_Q_SZ + MV_CRPB_Q_SZ),
Dan> + MV_PORT_PRIV_DMA2_SZ = (MV_SG_TBL_SZ),

I don't like these names for the reagions. DMA1 and DMA2 mean
nothing. Should be be something a little more descriptive? And a
comment why you split them up would be good too.

Dan> +#pragma pack(1)

I don't see these used anywhere else in the kernel, they should
probably go.

Dan> +
Dan> /* Command ReQuest Block: 32B */
Dan> struct mv_crqb {
Dan> - u32 sg_addr;
Dan> - u32 sg_addr_hi;
Dan> - u16 ctrl_flags;
Dan> + volatile u32 sg_addr;
Dan> + volatile u32 sg_addr_hi;
Dan> + volatile u16 ctrl_flags;
Dan> u16 ata_cmd[11];
Dan> };

Dan> /* Command ResPonse Block: 8B */
Dan> struct mv_crpb {
Dan> - u16 id;
Dan> - u16 flags;
Dan> - u32 tmstmp;
Dan> + volatile u16 id;
Dan> + volatile u16 flags;
Dan> + volatile u32 tmstmp;
Dan> };

Dan> /* EDMA Physical Region Descriptor (ePRD); A.K.A. SG */
Dan> struct mv_sg {
Dan> - u32 addr;
Dan> - u32 flags_size;
Dan> - u32 addr_hi;
Dan> - u32 reserved;
Dan> + volatile u32 addr;
Dan> + volatile u16 size;
Dan> + volatile u16 flags;
Dan> + volatile u32 addr_hi;
Dan> + volatile u32 reserved;
Dan> };

Dan> +#pragma pack(0)
Dan> +
Dan> struct mv_port_priv {
Dan> struct mv_crqb *crqb;
Dan> dma_addr_t crqb_dma;
Dan> @@ -294,8 +300,8 @@
Dan> };

Dan> struct mv_port_signal {
Dan> - u32 amps;
Dan> - u32 pre;
Dan> + volatile u32 amps;
Dan> + volatile u32 pre;
Dan> };

Dan> struct mv_host_priv;
Dan> @@ -365,7 +371,7 @@
Dan> .eh_strategy_handler = ata_scsi_error,
Dan> .can_queue = MV_USE_Q_DEPTH,
Dan> .this_id = ATA_SHT_THIS_ID,
Dan> - .sg_tablesize = MV_MAX_SG_CT / 2,
Dan> + .sg_tablesize = MV_MAX_SG_CT,
Dan> .max_sectors = ATA_MAX_SECTORS,
Dan> .cmd_per_lun = ATA_SHT_CMD_PER_LUN,
Dan> .emulated = ATA_SHT_EMULATED,
Dan> @@ -509,10 +515,7 @@
Dan> .reset_bus = mv_reset_pci_bus,
Dan> };

Dan> -/*
Dan> - * module options
Dan> - */
Dan> -static int msi; /* Use PCI msi; either zero (off, default) or non-zero */
Dan> +static int msi; /* Use PCI msi; either zero (off, default) or non-zero */


Dan> /*
Dan> @@ -770,7 +773,8 @@

Dan> static inline void mv_priv_free(struct mv_port_priv *pp, struct device *dev)
Dan> {
Dan> - dma_free_coherent(dev, MV_PORT_PRIV_DMA_SZ, pp->crpb, pp->crpb_dma);
Dan> + dma_free_coherent(dev, MV_PORT_PRIV_DMA1_SZ, pp->crpb, pp->crpb_dma);
Dan> + dma_free_coherent(dev, MV_PORT_PRIV_DMA2_SZ, pp->sg_tbl, pp->sg_tbl_dma);
Dan> }

Could you name these something like MV_PORT_CRPB_SZ and
MV_PORT_SGTBL_SZ instead? Seems to make more sense here, and shows
exactly what they refer to.


Dan> /**
Dan> @@ -788,8 +792,8 @@
Dan> struct device *dev = ap->host_set->dev;
Dan> struct mv_port_priv *pp;
Dan> void __iomem *port_mmio = mv_ap_base(ap);
Dan> - void *mem;
Dan> - dma_addr_t mem_dma;
Dan> + void *mem, *mem2;
Dan> + dma_addr_t mem_dma, mem_dma2;
Dan> int rc = -ENOMEM;

Dan> pp = kmalloc(sizeof(*pp), GFP_KERNEL);
Dan> @@ -797,11 +801,17 @@
Dan> goto err_out;
Dan> memset(pp, 0, sizeof(*pp));

Dan> - mem = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA_SZ, &mem_dma,
Dan> + mem = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA1_SZ, &mem_dma,
Dan> GFP_KERNEL);
Dan> if (!mem)
Dan> goto err_out_pp;
Dan> - memset(mem, 0, MV_PORT_PRIV_DMA_SZ);
Dan> + memset(mem, 0, MV_PORT_PRIV_DMA1_SZ);
Dan> +
Dan> + mem2 = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA2_SZ, &mem_dma2,
Dan> + GFP_KERNEL);
Dan> + if (!mem2)
Dan> + goto err_out_pp_2;
Dan> + memset(mem2, 0, MV_PORT_PRIV_DMA2_SZ);

Dan> rc = ata_pad_alloc(ap, dev);
Dan> if (rc)
Dan> @@ -820,14 +830,12 @@
Dan> */
pp-> crpb = mem;
pp-> crpb_dma = mem_dma;
Dan> - mem += MV_CRPB_Q_SZ;
Dan> - mem_dma += MV_CRPB_Q_SZ;

Dan> /* Third item:
Dan> * Table of scatter-gather descriptors (ePRD), 16 bytes each
Dan> */
Dan> - pp->sg_tbl = mem;
Dan> - pp->sg_tbl_dma = mem_dma;
Dan> + pp->sg_tbl = mem2;
Dan> + pp->sg_tbl_dma = mem_dma2;

Dan> writelfl(EDMA_CFG_Q_DEPTH | EDMA_CFG_RD_BRST_EXT |
Dan> EDMA_CFG_WR_BUFF_LEN, port_mmio + EDMA_CFG_OFS);
Dan> @@ -853,7 +861,9 @@
Dan> return 0;

Dan> err_out_priv:
Dan> - mv_priv_free(pp, dev);
Dan> + dma_free_coherent(dev, MV_PORT_PRIV_DMA2_SZ, pp->sg_tbl, pp->sg_tbl_dma);
Dan> +err_out_pp_2:
Dan> + dma_free_coherent(dev, MV_PORT_PRIV_DMA1_SZ, pp->crpb, pp->crpb_dma);
Dan> err_out_pp:
Dan> kfree(pp);
Dan> err_out:
Dan> @@ -915,13 +925,15 @@

pp-> sg_tbl[i].addr = cpu_to_le32(addr & 0xffffffff);
pp-> sg_tbl[i].addr_hi = cpu_to_le32((addr >> 16) >> 16);
Dan> - pp->sg_tbl[i].flags_size = cpu_to_le32(len);
Dan> + pp->sg_tbl[i].flags = cpu_to_le16(0);
Dan> + pp->sg_tbl[i].size = cpu_to_le16(len);
Dan> + pp->sg_tbl[i].reserved = 0;

Dan> sg_len -= len;
Dan> addr += len;

Dan> if (!sg_len && ata_sg_is_last(sg, qc))
Dan> - pp->sg_tbl[i].flags_size |= cpu_to_le32(EPRD_FLAG_END_OF_TBL);
Dan> + pp->sg_tbl[i].flags |= cpu_to_le16(EPRD_FLAG_END_OF_TBL);

Dan> i++;
Dan> }

John

2006-03-09 06:36:09

by Dan Aloni

[permalink] [raw]
Subject: Re: [PATCH] sata_mv: stabilize for 5081 and other fixes

> Dan> - MV_MAX_SG_CT = 176,
> Dan> + MV_MAX_SG_CT = 256,
> Dan> MV_SG_TBL_SZ = (16 * MV_MAX_SG_CT),
> Dan> - MV_PORT_PRIV_DMA_SZ = (MV_CRQB_Q_SZ + MV_CRPB_Q_SZ + MV_SG_TBL_SZ),
> Dan> + MV_PORT_PRIV_DMA1_SZ = (MV_CRQB_Q_SZ + MV_CRPB_Q_SZ),
> Dan> + MV_PORT_PRIV_DMA2_SZ = (MV_SG_TBL_SZ),
>
> I don't like these names for the reagions. DMA1 and DMA2 mean
> nothing. Should be be something a little more descriptive? And a
> comment why you split them up would be good too.

I admit I don't like these names either - however I didn't dedicate
enough time for this patch because I'd like the maintainer(s) to
give notes about my changes unrelated to coding conventions, but
thanks anyway.

If this patch is Right (tm) then I'll repost a better-looking version
of it or let the maintainer(s) implement in the way they see fit.

> Dan> +#pragma pack(1)
>
> I don't see these used anywhere else in the kernel, they should
> probably go.

It might not affect the packing anyway in this case, but I was
paranoid when I tried to fix the driver.

> Dan> -/*
> Dan> - * module options
> Dan> - */
> Dan> -static int msi; /* Use PCI msi; either zero (off, default) or non-zero */
> Dan> +static int msi; /* Use PCI msi; either zero (off, default) or non-zero */

Whoops, shouldn't have removed the comment. I guess I'll re-post
anyway.

--
Dan Aloni
[email protected], [email protected], [email protected], [email protected]

2006-03-12 01:45:34

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH] sata_mv: stabilize for 5081 and other fixes

Dan Aloni wrote:
> Hello,
>
> With the patch below I've managed to stabilize the sata_mv driver
> running a Marvell 5081 SATA controller. Prior to these changes it
> would cause sporadic memory corruptions right after insmod. I prefer
> this driver over Marvell's own driver which have all sorts of weird
> bugs.
>
> The patch also increases the maximum possible number of SG entries
> per IO to 256 (this large sg_tablesize is a requirement for the
> application we are developing at my company, XIV). Notice that it
> also zeros-out a reserved field in the SG, I'm not sure how it
> affected stability but something did. I've also added the 'volatile'
> qualifier to some fields that belong to structs involved with I/O.

Adding 'volatile' is to be avoided. This is simply hiding bugs
elsewhere. volatile is an "atom bomb" that kills quite valid
optimizations, needlessly. Most likely you need to sprinkle rmb(),
wmb(), and/or mb() in the correct locations.

Also, it isn't clear at all from your description precisely which
changes are fixes, and which are cleanups and optimizations. It would
be nice to get each category of changes split into two separate patches.



> --- drivers/scsi/sata_mv.c 2006-03-08 11:30:10.000000000 +0200
> +++ drivers/scsi/sata_mv.c 2006-03-08 20:59:55.000000000 +0200
> @@ -72,9 +72,10 @@
> */
> MV_CRQB_Q_SZ = (32 * MV_MAX_Q_DEPTH),
> MV_CRPB_Q_SZ = (8 * MV_MAX_Q_DEPTH),
> - MV_MAX_SG_CT = 176,
> + MV_MAX_SG_CT = 256,
> MV_SG_TBL_SZ = (16 * MV_MAX_SG_CT),
> - MV_PORT_PRIV_DMA_SZ = (MV_CRQB_Q_SZ + MV_CRPB_Q_SZ + MV_SG_TBL_SZ),
> + MV_PORT_PRIV_DMA1_SZ = (MV_CRQB_Q_SZ + MV_CRPB_Q_SZ),
> + MV_PORT_PRIV_DMA2_SZ = (MV_SG_TBL_SZ),
>
> MV_PORTS_PER_HC = 4,
> /* == (port / MV_PORTS_PER_HC) to determine HC from 0-7 port */
> @@ -98,7 +99,7 @@
>
> CRPB_FLAG_STATUS_SHIFT = 8,
>
> - EPRD_FLAG_END_OF_TBL = (1 << 31),
> + EPRD_FLAG_END_OF_TBL = (1 << 15),
>
> /* PCI interface registers */
>
> @@ -257,29 +258,34 @@
> chip_608x,
> };
>
> +#pragma pack(1)
> +
> /* Command ReQuest Block: 32B */
> struct mv_crqb {
> - u32 sg_addr;
> - u32 sg_addr_hi;
> - u16 ctrl_flags;
> + volatile u32 sg_addr;
> + volatile u32 sg_addr_hi;
> + volatile u16 ctrl_flags;
> u16 ata_cmd[11];
> };
>
> /* Command ResPonse Block: 8B */
> struct mv_crpb {
> - u16 id;
> - u16 flags;
> - u32 tmstmp;
> + volatile u16 id;
> + volatile u16 flags;
> + volatile u32 tmstmp;
> };
>
> /* EDMA Physical Region Descriptor (ePRD); A.K.A. SG */
> struct mv_sg {
> - u32 addr;
> - u32 flags_size;
> - u32 addr_hi;
> - u32 reserved;
> + volatile u32 addr;
> + volatile u16 size;
> + volatile u16 flags;
> + volatile u32 addr_hi;
> + volatile u32 reserved;
> };
>
> +#pragma pack(0)
> +
> struct mv_port_priv {
> struct mv_crqb *crqb;
> dma_addr_t crqb_dma;
> @@ -294,8 +300,8 @@
> };
>
> struct mv_port_signal {
> - u32 amps;
> - u32 pre;
> + volatile u32 amps;
> + volatile u32 pre;
> };

see comment on 'volatile', above. We can't add these changes.


> struct mv_host_priv;
> @@ -365,7 +371,7 @@
> .eh_strategy_handler = ata_scsi_error,
> .can_queue = MV_USE_Q_DEPTH,
> .this_id = ATA_SHT_THIS_ID,
> - .sg_tablesize = MV_MAX_SG_CT / 2,
> + .sg_tablesize = MV_MAX_SG_CT,

This is adding a bug.

The IOMMU worst case requires a split for each s/g entry, due to DMA
boundary issues. See mv_fill_sg() or ata_fill_sg().

Thus, the above "/ 2" is required.


> .max_sectors = ATA_MAX_SECTORS,
> .cmd_per_lun = ATA_SHT_CMD_PER_LUN,
> .emulated = ATA_SHT_EMULATED,
> @@ -509,10 +515,7 @@
> .reset_bus = mv_reset_pci_bus,
> };
>
> -/*
> - * module options
> - */
> -static int msi; /* Use PCI msi; either zero (off, default) or non-zero */
> +static int msi; /* Use PCI msi; either zero (off, default) or non-zero */

what changed here?


> /*
> @@ -770,7 +773,8 @@
>
> static inline void mv_priv_free(struct mv_port_priv *pp, struct device *dev)
> {
> - dma_free_coherent(dev, MV_PORT_PRIV_DMA_SZ, pp->crpb, pp->crpb_dma);
> + dma_free_coherent(dev, MV_PORT_PRIV_DMA1_SZ, pp->crpb, pp->crpb_dma);
> + dma_free_coherent(dev, MV_PORT_PRIV_DMA2_SZ, pp->sg_tbl, pp->sg_tbl_dma);
> }
>
> /**
> @@ -788,8 +792,8 @@
> struct device *dev = ap->host_set->dev;
> struct mv_port_priv *pp;
> void __iomem *port_mmio = mv_ap_base(ap);
> - void *mem;
> - dma_addr_t mem_dma;
> + void *mem, *mem2;
> + dma_addr_t mem_dma, mem_dma2;
> int rc = -ENOMEM;
>
> pp = kmalloc(sizeof(*pp), GFP_KERNEL);
> @@ -797,11 +801,17 @@
> goto err_out;
> memset(pp, 0, sizeof(*pp));
>
> - mem = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA_SZ, &mem_dma,
> + mem = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA1_SZ, &mem_dma,
> GFP_KERNEL);
> if (!mem)
> goto err_out_pp;
> - memset(mem, 0, MV_PORT_PRIV_DMA_SZ);
> + memset(mem, 0, MV_PORT_PRIV_DMA1_SZ);
> +
> + mem2 = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA2_SZ, &mem_dma2,
> + GFP_KERNEL);
> + if (!mem2)
> + goto err_out_pp_2;
> + memset(mem2, 0, MV_PORT_PRIV_DMA2_SZ);
>
> rc = ata_pad_alloc(ap, dev);
> if (rc)
> @@ -820,14 +830,12 @@
> */
> pp->crpb = mem;
> pp->crpb_dma = mem_dma;
> - mem += MV_CRPB_Q_SZ;
> - mem_dma += MV_CRPB_Q_SZ;
>
> /* Third item:
> * Table of scatter-gather descriptors (ePRD), 16 bytes each
> */
> - pp->sg_tbl = mem;
> - pp->sg_tbl_dma = mem_dma;
> + pp->sg_tbl = mem2;
> + pp->sg_tbl_dma = mem_dma2;

why two allocations?

why not just fix the [probable] alignment issue?

I also agree with John Stoffel's comments.

Jeff


2006-03-12 05:24:14

by Dan Aloni

[permalink] [raw]
Subject: Re: [PATCH] sata_mv: stabilize for 5081 and other fixes

On Sat, Mar 11, 2006 at 08:45:29PM -0500, Jeff Garzik wrote:
> Dan Aloni wrote:
> >Hello,
> >
> >With the patch below I've managed to stabilize the sata_mv driver
> >running a Marvell 5081 SATA controller. Prior to these changes it
> >would cause sporadic memory corruptions right after insmod. I prefer
> >this driver over Marvell's own driver which have all sorts of weird
> >bugs.
> >
> >The patch also increases the maximum possible number of SG entries
> >per IO to 256 (this large sg_tablesize is a requirement for the
> >application we are developing at my company, XIV). Notice that it
> >also zeros-out a reserved field in the SG, I'm not sure how it
> >affected stability but something did. I've also added the 'volatile'
> >qualifier to some fields that belong to structs involved with I/O.
>
> Adding 'volatile' is to be avoided. This is simply hiding bugs
> elsewhere. volatile is an "atom bomb" that kills quite valid
> optimizations, needlessly. Most likely you need to sprinkle rmb(),
> wmb(), and/or mb() in the correct locations.

I'm not sure if these memory barriers are even needed, or whether
volatile made any impact on stability - but I can isolate these
changes today and see if it has any impact simply by experimenting.

> Also, it isn't clear at all from your description precisely which
> changes are fixes, and which are cleanups and optimizations. It would
> be nice to get each category of changes split into two separate patches.

I would prefer to just minimize the whole thing to a patch that
only fixes the stabilty problem, less paranoidically. If you can
suggest such patch for me to test I'd be happy to give it a try.

> > struct mv_host_priv;
> >@@ -365,7 +371,7 @@
> > .eh_strategy_handler = ata_scsi_error,
> > .can_queue = MV_USE_Q_DEPTH,
> > .this_id = ATA_SHT_THIS_ID,
> >- .sg_tablesize = MV_MAX_SG_CT / 2,
> >+ .sg_tablesize = MV_MAX_SG_CT,
>
> This is adding a bug.
>
> The IOMMU worst case requires a split for each s/g entry, due to DMA
> boundary issues. See mv_fill_sg() or ata_fill_sg().
>
> Thus, the above "/ 2" is required.

Interesting, I'll look into that. I wonder how it managed to work
so far.

> > .max_sectors = ATA_MAX_SECTORS,
> > .cmd_per_lun = ATA_SHT_CMD_PER_LUN,
> > .emulated = ATA_SHT_EMULATED,
> >@@ -509,10 +515,7 @@
> > .reset_bus = mv_reset_pci_bus,
> > };
> >
> >-/*
> >- * module options
> >- */
> >-static int msi; /* Use PCI msi; either zero (off, default) or
> >non-zero */
> >+static int msi; /* Use PCI msi; either zero (off, default)
> >or non-zero */
>
> what changed here?

Nothing, that's just a diff hunk I should have cleaned away.

> > /*
> >@@ -770,7 +773,8 @@
> >
> > static inline void mv_priv_free(struct mv_port_priv *pp, struct device
> > *dev)
> > {
> >- dma_free_coherent(dev, MV_PORT_PRIV_DMA_SZ, pp->crpb, pp->crpb_dma);
> >+ dma_free_coherent(dev, MV_PORT_PRIV_DMA1_SZ, pp->crpb, pp->crpb_dma);
> >+ dma_free_coherent(dev, MV_PORT_PRIV_DMA2_SZ, pp->sg_tbl,
> >pp->sg_tbl_dma);
> > }
> >
> > /**
> >@@ -788,8 +792,8 @@
> > struct device *dev = ap->host_set->dev;
> > struct mv_port_priv *pp;
> > void __iomem *port_mmio = mv_ap_base(ap);
> >- void *mem;
> >- dma_addr_t mem_dma;
> >+ void *mem, *mem2;
> >+ dma_addr_t mem_dma, mem_dma2;
> > int rc = -ENOMEM;
> >
> > pp = kmalloc(sizeof(*pp), GFP_KERNEL);
> >@@ -797,11 +801,17 @@
> > goto err_out;
> > memset(pp, 0, sizeof(*pp));
> >
> >- mem = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA_SZ, &mem_dma,
> >+ mem = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA1_SZ, &mem_dma,
> > GFP_KERNEL);
> > if (!mem)
> > goto err_out_pp;
> >- memset(mem, 0, MV_PORT_PRIV_DMA_SZ);
> >+ memset(mem, 0, MV_PORT_PRIV_DMA1_SZ);
> >+
> >+ mem2 = dma_alloc_coherent(dev, MV_PORT_PRIV_DMA2_SZ, &mem_dma2,
> >+ GFP_KERNEL);
> >+ if (!mem2)
> >+ goto err_out_pp_2;
> >+ memset(mem2, 0, MV_PORT_PRIV_DMA2_SZ);
> >
> > rc = ata_pad_alloc(ap, dev);
> > if (rc)
> >@@ -820,14 +830,12 @@
> > */
> > pp->crpb = mem;
> > pp->crpb_dma = mem_dma;
> >- mem += MV_CRPB_Q_SZ;
> >- mem_dma += MV_CRPB_Q_SZ;
> >
> > /* Third item:
> > * Table of scatter-gather descriptors (ePRD), 16 bytes each
> > */
> >- pp->sg_tbl = mem;
> >- pp->sg_tbl_dma = mem_dma;
> >+ pp->sg_tbl = mem2;
> >+ pp->sg_tbl_dma = mem_dma2;
>
> why two allocations?

A 256 entry SG array takes a PAGE_SIZE, I didn't want to allocate more
than PAGE_SIZE to avoid fragmentation problems.

> why not just fix the [probable] alignment issue?

Yes, I forgot these allocations should be aligned (by 16, right?).

> I also agree with John Stoffel's comments.

Me too.

--
Dan Aloni
[email protected], [email protected], [email protected], [email protected]

2006-03-12 05:49:09

by Dan Aloni

[permalink] [raw]
Subject: Re: [PATCH] sata_mv: stabilize for 5081 and other fixes

On Sat, Mar 11, 2006 at 08:45:29PM -0500, Jeff Garzik wrote:
>
> This is adding a bug.
>
> The IOMMU worst case requires a split for each s/g entry, due to DMA
> boundary issues. See mv_fill_sg() or ata_fill_sg().
>
> Thus, the above "/ 2" is required.

Okay I figured it out - here we are using the SCSI sg driver, and
a scatter-gatter entry generated by that driver will never cross
a page boundery (nor a DMA boundary), because each userspace page
is mapped into one scatter-gatter entry, so in that case the "/ 2"
isn't needed. Coming to think about it, that's only valid on x86
and x86_64. Are there other architectures that break that assumption?

I'd still want to be able to read/write 1MB at a time, otherwise
it would require massive userspace code rewrites in our application,
(limiting it to 0.5MB). Do you have any suggestions about how to do
that? I mean, it is not trivial to pass 128 entries of 2*PAGE_SIZE
based on userspace memory.

p.s. I thought scatter-gatter entries are only valid for the page
they point to, it's good to learn new things :)

--
Dan Aloni
[email protected], [email protected], [email protected], [email protected]

2006-03-16 10:16:33

by Sander

[permalink] [raw]
Subject: Re: [PATCH] sata_mv: stabilize for 5081 and other fixes

Hi Dan,

Dan Aloni wrote (ao):
> With the patch below I've managed to stabilize the sata_mv driver
> running a Marvell 5081 SATA controller.

Your patch (applied to 2.6.16-rc6) seems to work on the MV88SX6081 too.
I have two Maxtor disks connected and in a raid0 configuration. The
array is both fast and stable. I see no error messages in dmesg and no
data corruption.

I have yet to test if raid5 works well too, so I'll report on that
later. It didn't work a month ago:

http://www.ussg.iu.edu/hypermail/linux/kernel/0602.2/0360.html

Thanks a lot for your patch!

Kind regards, Sander

--
Humilis IT Services and Solutions
http://www.humilis.net

2006-03-16 13:03:28

by Dan Aloni

[permalink] [raw]
Subject: Re: [PATCH] sata_mv: stabilize for 5081 and other fixes

On Thu, Mar 16, 2006 at 11:16:30AM +0100, Sander wrote:
> Hi Dan,
>
> Dan Aloni wrote (ao):
> > With the patch below I've managed to stabilize the sata_mv driver
> > running a Marvell 5081 SATA controller.
>
> Your patch (applied to 2.6.16-rc6) seems to work on the MV88SX6081 too.
> I have two Maxtor disks connected and in a raid0 configuration. The
> array is both fast and stable. I see no error messages in dmesg and no
> data corruption.

I take it that without the patch this particular configuration didn't
work?

> I have yet to test if raid5 works well too, so I'll report on that
> later. It didn't work a month ago:
>
> http://www.ussg.iu.edu/hypermail/linux/kernel/0602.2/0360.html
>
> Thanks a lot for your patch!

Good, I plan to post a cleaner version of it and have it merged.

--
Dan Aloni
[email protected], [email protected], [email protected], [email protected]

2006-03-20 08:30:25

by Sander

[permalink] [raw]
Subject: Re: [PATCH] sata_mv: stabilize for 5081 and other fixes

Dan Aloni wrote (ao):
> On Thu, Mar 16, 2006 at 11:16:30AM +0100, Sander wrote:
> > Dan Aloni wrote (ao):
> > > With the patch below I've managed to stabilize the sata_mv driver
> > > running a Marvell 5081 SATA controller.
> >
> > Your patch (applied to 2.6.16-rc6) seems to work on the MV88SX6081
> > too. I have two Maxtor disks connected and in a raid0 configuration.
> > The array is both fast and stable. I see no error messages in dmesg
> > and no data corruption.
>
> I take it that without the patch this particular configuration didn't
> work?

No, that was raid5 only.

Now I tried to test as much of the configuration that did not work for
me. The current testsystem is a bit different from the original though,
as that was with Maxtor Pro 500 disks, and the current one Maxtor
MaxLine10 disks. Also the kernel was 2.6.16-rc3, and now is 2.6.16-rc6.

I do:

mdadm -C -l5 -n8 /dev/md0 /dev/sda1 /dev/sdb1 /dev/sdc1 \
/dev/sdd1 /dev/sde1 /dev/sdf1 /dev/sdg1 /dev/sdh1

mke2fs -j -m1 /dev/md0

mount -o data=writeback /dev/md0 /mnt

for i in `seq 10`
do dd if=/dev/zero of=bigfile.$i bs=1024k count=10000
done
md5sum bigfile.*
rm bigfile.*

With plain 2.6.16-rc6 the system crashes during one of the md5sums, most
of the time the first or the second file. I've lost the last line on
netconsole, but will reproduce with 2.6.16.

With 2.6.16-rc6 and your sata_mv patch the system also crashes during
one of the md5sums, most of the time the first or the second file. There
is no output on netconsole.

2.6.16-rc6-mm2 seems to work, which I did not expect..

--
Humilis IT Services and Solutions
http://www.humilis.net