Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1750921AbVLPXvK (ORCPT );
	Fri, 16 Dec 2005 18:51:10 -0500
Received: (majordomo@vger.kernel.org) by vger.kernel.org
	id S964821AbVLPXuM (ORCPT );
	Fri, 16 Dec 2005 18:50:12 -0500
Received: from sj-iport-5.cisco.com ([171.68.10.87]:10862 "EHLO
	sj-iport-5.cisco.com") by vger.kernel.org with ESMTP
	id S964818AbVLPXtP (ORCPT );
	Fri, 16 Dec 2005 18:49:15 -0500
X-IronPort-AV: i="3.99,263,1131350400"; d="scan'208"; a="242176712:sNHT1955315644"
Subject: [PATCH 06/13] [RFC] ipath LLD core, part 3
In-Reply-To: <200512161548.YvnmQHKTsmmCBp1k@cisco.com>
X-Mailer: Roland's Patchbomber
Date: Fri, 16 Dec 2005 15:48:55 -0800
Message-Id: <200512161548.KglSM2YESlGlEQfQ@cisco.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
To: linux-kernel@vger.kernel.org, openib-general@openib.org
Content-Transfer-Encoding: 7BIT
From: Roland Dreier
X-OriginalArrivalTime: 16 Dec 2005 23:48:57.0042 (UTC) FILETIME=[478FA320:01C6029B]
Sender: linux-kernel-owner@vger.kernel.org
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Length: 78986
Lines: 2403

Last part of core driver

---

 drivers/infiniband/hw/ipath/ipath_driver.c | 2380 ++++++++++++++++++++++++++++
 1 files changed, 2380 insertions(+), 0 deletions(-)

f7ffc0cabd62be5e13ad84027d5712e6f92d9cc1
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
index 0dee4ce..87b6dae 100644
--- a/drivers/infiniband/hw/ipath/ipath_driver.c
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c
@@ -4877,3 +4877,2383 @@ static int ipath_wait_intr(ipath_portdat
 	}
 	return 0;
 }
+
+/*
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller.  To make it easier to
+ * catch bugs, and to reduce search time, we keep a cursor for
+ * each port, walking the shadow tid array to find one that's not
+ * in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ */
+static int ipath_tid_update(ipath_portdata * pd, struct _tidupd *tidu)
+{
+	int ret = 0, ntids;
+	uint32_t tid, porttid, cnt, i, tidcnt;
+	struct _tidupd tu;
+	uint16_t *tidlist;
+	ipath_devdata *dd = &devdata[pd->port_unit];
+	uint64_t vaddr, physaddr, lenvalid;
+	volatile uint64_t *tidbase;
+	uint64_t tidmap[8];
+	struct page **pagep = NULL;
+
+	tu.tidcnt = 0;	/* for early errors */
+	if (!dd->ipath_pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+	if (copy_from_user(&tu, tidu, sizeof tu)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	if (!(cnt = tu.tidcnt)) {
+		_IPATH_DBG("After copyin, tidcnt 0, tidlist %llx\n",
+			   tu.tidlist);
+		/* or should we treat as success?  likely a bug */
+		ret = -EFAULT;
+		goto done;
+	}
+	tidcnt = dd->ipath_rcvtidcnt;
+	if (cnt >= tidcnt) {
+		/* make sure it all fits in port_tid_pg_list */
+		_IPATH_INFO
+		    ("Process tried to allocate %u TIDs, only trying max (%u)\n",
+		     cnt, tidcnt);
+		cnt = tidcnt;
+	}
+	pagep = (struct page **)pd->port_tid_pg_list;
+	tidlist = (uint16_t *) (&pagep[cnt]);
+
+	memset(tidmap, 0, sizeof(tidmap));
+	tid = pd->port_tidcursor;
+	/* before decrement; chip actual # */
+	porttid = pd->port_port * tidcnt;
+	ntids = tidcnt;
+	tidbase = (volatile uint64_t *)((volatile char *)
+					(devdata[pd->port_unit].ipath_kregbase) +
+					devdata[pd->port_unit].ipath_rcvtidbase +
+					porttid * sizeof(*tidbase));
+
+	_IPATH_VDBG("Port%u %u tids, cursor %u, tidbase %p\n", pd->port_port,
+		    cnt, tid, tidbase);
+
+	vaddr = tu.tidvaddr;	/* virtual address of first page in transfer */
+	if (!access_ok(VERIFY_WRITE, (void *)vaddr, cnt * PAGE_SIZE)) {
+		_IPATH_DBG("Fail vaddr %llx, %u pages, !access_ok\n",
+			   vaddr, cnt);
+		ret = -EFAULT;
+		goto done;
+	}
+	if ((ret = ipath_mlock((unsigned long)vaddr, cnt, pagep))) {
+		if (ret == -EBUSY) {
+			_IPATH_DBG
+			    ("Failed to lock addr %p, %u pages (already locked)\n",
+			     (void *)vaddr, cnt);
+			/*
+			 * for now, continue, and see what happens
+			 * but with the new implementation, this should
+			 * never happen, unless perhaps the user has
+			 * mpin'ed the pages themselves (something we
+			 * need to test)
+			 */
+			ret = 0;
+		} else {
+			_IPATH_INFO
+			    ("Failed to lock addr %p, %u pages: errno %d\n",
+			     (void *)vaddr, cnt, -ret);
+			goto done;
+		}
+	}
+	for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+		for (; ntids--; tid++) {
+			if (tid == tidcnt)
+				tid = 0;
+			if (!dd->ipath_pageshadow[porttid + tid])
+				break;
+		}
+		if (ntids < 0) {
+			/*
+			 * oops, wrapped all the way through their TIDs,
+			 * and didn't have enough free; see comments at
+			 * start of routine
+			 */
+			_IPATH_DBG
+			    ("Not enough free TIDs for %u pages (index %d), failing\n",
+			     cnt, i);
+			i--;	/* last tidlist[i] not filled in */
+			ret = -ENOMEM;
+			break;
+		}
+		tidlist[i] = tid;
+		_IPATH_VDBG("Updating idx %u to TID %u, vaddr %llx\n",
+			    i, tid, vaddr);
+		/* for now we "know" system pages and TID pages are same size */
+		/* for ipath_free_tid */
+		dd->ipath_pageshadow[porttid + tid] = pagep[i];
+		__set_bit(tid, tidmap);	/* don't need atomic or it's overhead */
+		physaddr = page_to_phys(pagep[i]);
+		ipath_stats.sps_pagelocks++;
+		_IPATH_VDBG("TID %u, vaddr %llx, physaddr %llx pgp %p\n",
+			    tid, vaddr, physaddr, pagep[i]);
+		/*
+		 * in words (fixed, full page).  could make less for very last
+		 * page in transfer, but for now we won't worry about it.
+		 */
+		lenvalid = PAGE_SIZE >> 2;
+		lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+		physaddr |= lenvalid | INFINIPATH_RT_VALID;
+		ipath_kput_memq(pd->port_unit, &tidbase[tid], physaddr);
+		/*
+		 * don't check this tid in ipath_portshadow, since we
+		 * just filled it in; start with the next one.
+		 */
+		tid++;
+	}
+
+	if (ret) {
+		uint32_t limit;
+		uint64_t tidval;
+		/*
+		 * chip errata bug 7358, try to work around it by
+		 * marking invalid tids as having max length
+		 */
+		tidval =
+		    (~0ULL & INFINIPATH_RT_BUFSIZE_MASK) <<
+		    INFINIPATH_RT_BUFSIZE_SHIFT;
+	      cleanup:
+		/* jump here if copy out of updated info failed... */
+		_IPATH_DBG("After failure (ret=%d), undo %d of %d entries\n",
+			   -ret, i, cnt);
+		/* same code that's in ipath_free_tid() */
+		if ((limit = sizeof(tidmap) * _BITS_PER_BYTE) > tidcnt)
+			/* just in case size changes in future */
+			limit = tidcnt;
+		tid = find_first_bit((const unsigned long *)tidmap, limit);
+		/*
+		 * chip errata bug 7358, try to work around it by
+		 * marking invalid tids as having max length
+		 */
+		tidval =
+		    (~0ULL & INFINIPATH_RT_BUFSIZE_MASK) <<
+		    INFINIPATH_RT_BUFSIZE_SHIFT;
+		for (; tid < limit; tid++) {
+			if (!test_bit(tid, tidmap))
+				continue;
+			if (dd->ipath_pageshadow[porttid + tid]) {
+				_IPATH_VDBG("Freeing TID %u\n", tid);
+				ipath_kput_memq(pd->port_unit, &tidbase[tid],
+						tidval);
+				dd->ipath_pageshadow[porttid + tid] = NULL;
+				ipath_stats.sps_pageunlocks++;
+			}
+		}
+		(void)ipath_munlock(cnt, pagep);
+	} else {
+		/*
+		 * copy the updated array, with ipath_tid's filled in,
+		 * back to user.  Since we did the copy in already, this
+		 * "should never fail"
+		 * If it does, we have to clean up...
+		 */
+		int r;
+		if ((r =
+		     copy_to_user((void *)tu.tidlist, tidlist,
+				  cnt * sizeof(*tidlist)))) {
+			_IPATH_DBG
+			    ("Failed to copy out %d TIDs (%lx bytes) to %llx (ret %x)\n",
+			     cnt, cnt * sizeof(*tidlist), tu.tidlist, r);
+			ret = -EFAULT;
+			goto cleanup;
+		}
+		if (copy_to_user((void *)tu.tidmap, tidmap, sizeof tidmap)) {
+			_IPATH_DBG("Failed to copy out TID map to %llx\n",
+				   tu.tidmap);
+			ret = -EFAULT;
+			goto cleanup;
+		}
+		if (tid == tidcnt)
+			tid = 0;
+		pd->port_tidcursor = tid;
+	}
+
+done:
+	if (ret)
+		_IPATH_DBG
+		    ("Failed to map %u TID pages, failing with %d, tidu %p\n",
+		     tu.tidcnt, -ret, tidu);
+	return ret;
+}
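+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * The cursor-based scan in ipath_tid_update() above reduces to the
+ * following stand-alone sketch (hypothetical helper; the names are
+ * invented for illustration):
+ */
+#if 0
+static int example_find_free_tid(struct page **shadow, uint32_t cursor,
+				 uint32_t tidcnt)
+{
+	uint32_t tries;
+
+	for (tries = 0; tries < tidcnt; tries++, cursor++) {
+		if (cursor == tidcnt)
+			cursor = 0;	/* wrap, like the driver's scan */
+		if (!shadow[cursor])
+			return cursor;	/* free slot found */
+	}
+	return -1;	/* all TIDs in use */
+}
+#endif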
+
+/*
+ * right now we are unlocking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.  We check that the TID is in range for this port
+ * but otherwise don't check validity; if user has an error and
+ * frees the wrong tid, it's only their own data that can thereby
+ * be corrupted.  We do check that the TID was in use, for sanity.
+ * We always use our idea of the saved address, not the address
+ * that they pass in to us.
+ */
+
+static int ipath_tid_free(ipath_portdata * pd, struct _tidupd *tidu)
+{
+	int ret = 0;
+	uint32_t tid, porttid, cnt, limit, tidcnt;
+	struct _tidupd tu;
+	ipath_devdata *dd = &devdata[pd->port_unit];
+	uint64_t *tidbase;
+	uint64_t tidmap[8];
+	uint64_t tidval;
+
+	tu.tidcnt = 0;	/* for early errors */
+	if (!dd->ipath_pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	if (copy_from_user(&tu, tidu, sizeof tu)) {
+		_IPATH_DBG("copy of tidupd structure failed\n");
+		ret = -EFAULT;
+		goto done;
+	}
+	if (copy_from_user(tidmap, (void *)tu.tidmap, sizeof tidmap)) {
+		_IPATH_DBG("copy of tidmap failed\n");
+		ret = -EFAULT;
+		goto done;
+	}
+
+	porttid = pd->port_port * dd->ipath_rcvtidcnt;
+	tidbase =
+	    (uint64_t *) ((char *)(devdata[pd->port_unit].ipath_kregbase) +
+			  devdata[pd->port_unit].ipath_rcvtidbase +
+			  porttid * sizeof(*tidbase));
+
+	tidcnt = dd->ipath_rcvtidcnt;
+	if ((limit = sizeof(tidmap) * _BITS_PER_BYTE) > tidcnt)
+		limit = tidcnt;	/* just in case size changes in future */
+	tid = find_first_bit((const unsigned long *)tidmap, limit);
+	_IPATH_VDBG
+	    ("Port%u free %u tids; first bit (max=%d) set is %d, porttid %u\n",
+	     pd->port_port, tu.tidcnt, limit, tid, porttid);
+	/*
+	 * chip errata bug 7358, try to work around it by marking invalid
+	 * tids as having max length
+	 */
+	tidval =
+	    (~0ULL & INFINIPATH_RT_BUFSIZE_MASK) << INFINIPATH_RT_BUFSIZE_SHIFT;
+	for (cnt = 0; tid < limit; tid++) {
+		/*
+		 * small optimization; if we detect a run of 3 or so without
+		 * any set, use find_first_bit again.  That's mainly to
+		 * accelerate the case where we wrapped, so we have some at
+		 * the beginning, and some at the end, and a big gap
+		 * in the middle.
+		 */
+		if (!test_bit(tid, tidmap))
+			continue;
+		cnt++;
+		if (dd->ipath_pageshadow[porttid + tid]) {
+			_IPATH_VDBG("Freeing TID %u\n", tid);
+			ipath_kput_memq(pd->port_unit, &tidbase[tid], tidval);
+			ipath_munlock(1, &dd->ipath_pageshadow[porttid + tid]);
+			dd->ipath_pageshadow[porttid + tid] = NULL;
+			ipath_stats.sps_pageunlocks++;
+		} else
+			_IPATH_DBG("Unused tid %u, ignoring\n", tid);
+	}
+	if (cnt != tu.tidcnt)
+		_IPATH_DBG("passed in tidcnt %d, only %d bits set in map\n",
+			   tu.tidcnt, cnt);
+done:
+	if (ret)
+		_IPATH_DBG("Failed to unmap %u TID pages, failing with %d\n",
+			   tu.tidcnt, -ret);
+	return ret;
+}
+
+/* called from user init code, and also layered driver init */
+int ipath_setrcvhdrsize(const ipath_type mdev, unsigned rhdrsize)
+{
+	int ret = 0;
+
+	if (devdata[mdev].ipath_flags & IPATH_RCVHDRSZ_SET) {
+		if (devdata[mdev].ipath_rcvhdrsize != rhdrsize) {
+			_IPATH_INFO
+			    ("Error: can't set protocol header size %u, already %u\n",
+			     rhdrsize, devdata[mdev].ipath_rcvhdrsize);
+			ret = -EAGAIN;
+		} else
+			/* OK if set already, with same value, nothing to do */
+			_IPATH_VDBG("Reuse same protocol header size %u\n",
+				    devdata[mdev].ipath_rcvhdrsize);
+	} else if (rhdrsize >
+		   (devdata[mdev].ipath_rcvhdrentsize -
+		    (sizeof(uint64_t) / sizeof(uint32_t)))) {
+		_IPATH_DBG
+		    ("Error: can't set protocol header size %u (> max %u)\n",
+		     rhdrsize,
+		     devdata[mdev].ipath_rcvhdrentsize -
+		     (uint32_t) (sizeof(uint64_t) / sizeof(uint32_t)));
+		ret = -EOVERFLOW;
+	} else {
+		devdata[mdev].ipath_flags |= IPATH_RCVHDRSZ_SET;
+		devdata[mdev].ipath_rcvhdrsize = rhdrsize;
+		ipath_kput_kreg(mdev, kr_rcvhdrsize,
+				devdata[mdev].ipath_rcvhdrsize);
+		_IPATH_VDBG("Set protocol header size to %u\n",
+			    devdata[mdev].ipath_rcvhdrsize);
+	}
+	return ret;
+}
+
+/*
+ * find an available pio buffer, and do appropriate marking as busy, etc.
+ * returns buffer number if one found (>=0), negative number is error.
+ * Used by ipath_send_smapkt and ipath_layer_send
+ */
+int ipath_getpiobuf(int mdev)
+{
+	int i, j, starti, updated = 0;
+	unsigned piobcnt, iter;
+	unsigned long flags;
+	ipath_devdata *dd = &devdata[mdev];
+	uint64_t *shadow = dd->ipath_pioavailshadow;
+
+	piobcnt = (unsigned)dd->ipath_piobcnt;
+	starti = dd->ipath_lastport_piobuf;
+	iter = piobcnt - starti;
+	if (dd->ipath_upd_pio_shadow) {
+		/*
+		 * minor optimization.  If we had no buffers on last call,
+		 * start out by doing the update; continue and do scan
+		 * even if no buffers were updated, to be paranoid
+		 */
+		ipath_update_pio_bufs(mdev);
+		/* we scanned here, don't do it at end of scan */
+		updated = 1;
+		i = starti;
+	} else
+		i = dd->ipath_lastpioindex;
+
+rescan:
+	/*
+	 * while test_and_set_bit() is atomic,
+	 * we do that and then the change_bit(), and the pair is not.
+	 * See if this is the cause of the remaining armlaunch errors.
+	 */
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	for (j = 0; j < iter; j++, i++) {
+		if (i >= piobcnt)
+			i = starti;
+		/*
+		 * To avoid bus lock overhead, we first find a candidate
+		 * buffer, then do the test and set, and continue if
+		 * that fails.
+		 */
+		if (test_bit((2 * i) + 1, shadow) ||
+		    test_and_set_bit((2 * i) + 1, shadow)) {
+			continue;
+		}
+		/* flip generation bit */
+		change_bit(2 * i, shadow);
+		break;
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+	if (j == iter) {
+		/*
+		 * first time through; shadow exhausted, but may be
+		 * real buffers available, so go see; if any updated,
+		 * rescan (once)
+		 */
+		if (!updated) {
+			ipath_update_pio_bufs(mdev);
+			updated = 1;
+			i = starti;
+			goto rescan;
+		}
+		dd->ipath_upd_pio_shadow = 1;
+		/* not atomic, but if we lose one once in a while, that's OK */
+		ipath_stats.sps_nopiobufs++;
+		if (!(++dd->ipath_consec_nopiobuf % 100000)) {
+			_IPATH_DBG
+			    ("%u pio sends with no bufavail; dmacopy: %llx %llx %llx %llx; shadow: %llx %llx %llx %llx\n",
+			     dd->ipath_consec_nopiobuf,
+			     dd->ipath_pioavailregs_dma[0],
+			     dd->ipath_pioavailregs_dma[1],
+			     dd->ipath_pioavailregs_dma[2],
+			     dd->ipath_pioavailregs_dma[3],
+			     shadow[0], shadow[1], shadow[2], shadow[3]);
+			/*
+			 * 4 buffers per byte, 4 registers above, cover
+			 * rest below
+			 */
+			if (dd->ipath_piobcnt > (sizeof(shadow[0]) * 4 * 4))
+				_IPATH_DBG
+				    ("2nd group: dmacopy: %llx %llx %llx %llx; shadow: %llx %llx %llx %llx\n",
+				     dd->ipath_pioavailregs_dma[4],
+				     dd->ipath_pioavailregs_dma[5],
+				     dd->ipath_pioavailregs_dma[6],
+				     dd->ipath_pioavailregs_dma[7],
+				     shadow[4], shadow[5], shadow[6],
+				     shadow[7]);
+		}
+		return -EBUSY;
+	}
+
+	if (updated && dd->ipath_layer.l_intr) {
+		/*
+		 * ran out of bufs, now some (at least this one we just got)
+		 * are now available, so tell the layered driver.
+		 */
+		dd->ipath_layer.l_intr(mdev, IPATH_LAYER_INT_SEND_CONTINUE);
+	}
+
+	/*
+	 * set next starting place.  Since it's just an optimization,
+	 * it doesn't matter who wins on this, so no locking
+	 */
+	dd->ipath_lastpioindex = i + 1;
+	if (dd->ipath_upd_pio_shadow)
+		dd->ipath_upd_pio_shadow = 0;
+	if (dd->ipath_consec_nopiobuf)
+		dd->ipath_consec_nopiobuf = 0;
+	return i;
+}
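+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * The pioavail shadow above keeps two bits per PIO buffer: bit
+ * (2 * i) + 1 is the busy bit that ipath_getpiobuf() test-and-sets,
+ * and bit 2 * i is the generation bit it flips.  A minimal sketch of
+ * that encoding (hypothetical helpers, invented names):
+ */
+#if 0
+static int example_pio_busy(const unsigned long *shadow, unsigned i)
+{
+	return test_bit((2 * i) + 1, shadow);
+}
+
+static void example_pio_claim(unsigned long *shadow, unsigned i)
+{
+	set_bit((2 * i) + 1, shadow);	/* mark busy */
+	change_bit(2 * i, shadow);	/* flip generation */
+}
+#endif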
+
+/*
+ * this is like ipath_getpiobuf(), except it just probes to see if a buffer
+ * is available.  If it returns that there is one, it's not allocated,
+ * and so may not be available if caller tries to send.
+ * NOTE: This can be called from interrupt context by ipath_intr()
+ * and from non-interrupt context by layer_send_getpiobuf().
+ */
+int ipath_bufavail(int mdev)
+{
+	int i;
+	unsigned piobcnt;
+	uint64_t *shadow = devdata[mdev].ipath_pioavailshadow;
+
+	piobcnt = (unsigned)devdata[mdev].ipath_piobcnt;
+
+	for (i = devdata[mdev].ipath_lastport_piobuf; i < piobcnt; i++)
+		if (!test_bit((2 * i) + 1, shadow))
+			return 1;
+
+	/* if none, check for update and rescan if we updated */
+	ipath_update_pio_bufs(mdev);
+	for (i = devdata[mdev].ipath_lastport_piobuf; i < piobcnt; i++)
+		if (!test_bit((2 * i) + 1, shadow))
+			return 1;
+	_IPATH_PDBG("No bufs avail\n");
+	return 0;
+}
+
+/*
+ * This routine is no longer on any critical paths; it is used only
+ * for sending SMA packets, but that could change in the future, so it
+ * should be kept pretty tight, with anything that increases the cache
+ * footprint, adds branches, etc. carefully examined; anything needed
+ * only for unusual cases should be moved out to a separate routine,
+ * or out of the main execution path.
+ * Because it's currently sma only, there are no checks to see if the
+ * link is up; sma must be able to send in the not fully initialized state
+ */
+int ipath_send_smapkt(struct ipath_sendpkt * upkt)
+{
+	int i, ret = 0, whichpb;
+	uint32_t *piobuf, plen = 0, clen;
+	uint64_t pboff;
+	struct ipath_sendpkt kpkt;
+	struct ipath_iovec *iov = kpkt.sps_iov;
+	ipath_type t;
+
+	if (unlikely((copy_from_user(&kpkt, upkt, sizeof kpkt))))
+		ret = -EFAULT;
+	if (ret) {
+		_IPATH_VDBG("Send failed: error %d\n", -ret);
+		goto done;
+	}
+	t = kpkt.sps_flags;
+	if (t >= infinipath_max || !(devdata[t].ipath_flags & IPATH_PRESENT) ||
+	    !devdata[t].ipath_kregbase) {
+		_IPATH_SMADBG("illegal unit %u for sma send\n", t);
+		return -ENODEV;
+	}
+	if (!(devdata[t].ipath_flags & IPATH_INITTED)) {
+		/* no hardware, freeze, etc. */
+		_IPATH_SMADBG("unit %u not usable\n", t);
+		return -ENODEV;
+	}
+
+	/* need total length before first word written */
+	plen = sizeof(uint32_t);	/* +1 word is for the qword padding */
+	for (i = 0; i < kpkt.sps_cnt; i++)
+		/* each must be dword multiple */
+		plen += kpkt.sps_iov[i].iov_len;
+
+	if ((plen + 4) > devdata[t].ipath_ibmaxlen) {
+		_IPATH_DBG("Pkt len 0x%x > ibmaxlen %x!\n", plen - 4,
+			   devdata[t].ipath_ibmaxlen);
+		ret = -EINVAL;
+		goto done;	/* before writing pbc */
+	}
+	plen >>= 2;	/* in words */
+
+	whichpb = ipath_getpiobuf(t);
+	if (whichpb < 0) {
+		ret = whichpb;
+		devdata[t].ipath_nosma_bufs++;
+		_IPATH_SMADBG("No PIO buffers available unit %u %u times\n",
+			      t, devdata[t].ipath_nosma_bufs);
+		goto done;
+	}
+	if (devdata[t].ipath_nosma_bufs) {
+		_IPATH_SMADBG
+		    ("Unit %u got SMA send buffer after %u failures, %u seconds\n",
+		     t, devdata[t].ipath_nosma_bufs,
+		     devdata[t].ipath_nosma_secs);
+		devdata[t].ipath_nosma_bufs = 0;
+		devdata[t].ipath_nosma_secs = 0;
+	}
+	if ((devdata[t].ipath_lastibcstat & 0x11) != 0x11 &&
+	    (devdata[t].ipath_lastibcstat & 0x21) != 0x21) {
+		/*
+		 * we need to be at least at INIT for SMA packets to go out.
+		 * If we aren't, something has gone wrong, and SMA hasn't
+		 * noticed.  Therefore we'll try to go to INIT here, in
+		 * hopes of fixing up the problem.
+		 * First we verify that indeed the state is still "bad"
+		 * (that is, that lastibcstat isn't "stale")
+		 */
+		uint64_t val;
+		val = ipath_kget_kreg64(t, kr_ibcstatus);
+		if ((val & 0x11) != 0x11 && (val & 0x21) != 0x21) {
+			_IPATH_SMADBG
+			    ("Invalid Link state 0x%llx unit %u for send, try INIT\n",
+			     val, t);
+			ipath_set_ib_lstate(t, INFINIPATH_IBCC_LINKCMD_INIT);
+			val = ipath_kget_kreg64(t, kr_ibcstatus);
+			if ((val & 0x11) != 0x11 && (val & 0x21) != 0x21)
+				_IPATH_SMADBG
+				    ("Link state still not OK unit %u (0x%llx) after INIT\n",
+				     t, val);
+			else
+				_IPATH_SMADBG
+				    ("Link state OK unit %u (0x%llx) after INIT\n",
+				     t, val);
+		}
+		/* and continue, regardless */
+	}
+
+	pboff = devdata[t].ipath_piobufbase;
+	piobuf = (uint32_t *) (((char *)(devdata[t].ipath_kregbase)) + pboff +
+			       whichpb * devdata[t].ipath_palign);
+
+	if (infinipath_debug & __IPATH_PKTDBG)	/* SMA and PKT, both */
+		_IPATH_SMADBG("unit %u 0x%x+1w pio%d, (scnt %d)\n",
+			      t, plen - 1, whichpb, kpkt.sps_cnt);
+
+	ret = 0;
+	clen = 2;	/* size of the pbc */
+	{
+		/*
+		 * If this code ever gets used for anything performance
+		 * oriented, or that isn't inherently single-threaded,
+		 * then I need to implement the original idea of our
+		 * own equivalent of copy_from_user that uses only dword
+		 * or qword copies.  copy_from_user() can use byte copies,
+		 * and that is a problem for our chip.
+		 */
+		static uint32_t tmpbuf[2176 / sizeof(uint32_t)];
+		*(uint64_t *) tmpbuf = (uint64_t) plen;
+		for (i = 0; i < kpkt.sps_cnt; i++) {
+			if (unlikely
+			    (copy_from_user
+			     (tmpbuf + clen, (void *)iov->iov_base,
+			      iov->iov_len)))
+				ret = -EFAULT;	/* no break */
+			clen += iov->iov_len >> 2;
+			iov++;
+		}
+		ipath_dwordcpy(piobuf, tmpbuf, clen);
+	}
+
+	/* flush the packet out now, don't leave it waiting around */
+	mb();
+
+	if (ret) {
+		/*
+		 * Packet is bad, so we need to use the PIO abort mechanism to
+		 * abort the packet
+		 */
+		uint32_t sendctrl;
+		sendctrl = devdata[t].ipath_sendctrl | INFINIPATH_S_DISARM |
+		    (whichpb << INFINIPATH_S_DISARMPIOBUF_SHIFT);
+		_IPATH_DBG("Doing PIO abort on buffer %u after error\n",
+			   whichpb);
+		ipath_kput_kreg(t, kr_sendctrl, sendctrl);
+	}
+
+done:
+	return ret;
+}
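+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * The comment in ipath_send_smapkt() above notes that copy_from_user()
+ * may fall back to byte copies, which the chip's PIO buffers can't
+ * tolerate.  A dword-only copy such as ipath_dwordcpy() amounts to the
+ * following sketch (hypothetical stand-alone version; the driver's
+ * real helper may differ):
+ */
+#if 0
+static void example_dwordcpy(uint32_t *dst, const uint32_t *src,
+			     uint32_t ndwords)
+{
+	while (ndwords--)
+		*dst++ = *src++;	/* exactly one 32-bit store each */
+}
+#endif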
+
+/*
+ * implementation of the ioctl to get the counter values from the chip
+ * For the time being, we get all of them when asked, no shadowing.
+ * We need to shadow the byte counters at a minimum, because otherwise
+ * they will wrap in just a few seconds at full bandwidth
+ * The second argument is the user address to which we do the copy_to_user()
+ */
+static int ipath_get_counters(ipath_type t,
+			      struct infinipath_counters * ucounters)
+{
+	int ret = 0;
+	uint64_t val;
+	uint64_t *ucreg;
+	uint16_t vcreg;
+
+	ucreg = (uint64_t *) ucounters;
+	/*
+	 * for now, let's do this one at a time.  It's not the most
+	 * optimal method, but it is simple, and has no intermediate
+	 * memory requirements.
+	 */
+	for (vcreg = 0;
+	     vcreg < (sizeof(struct infinipath_counters) / sizeof(val));
+	     vcreg++, ucreg++) {
+		ipath_creg creg = vcreg;
+		val = ipath_snap_cntr(t, creg);
+		if ((ret = copy_to_user(ucreg, &val, sizeof(val)))) {
+			_IPATH_DBG("copy_to_user error on counter %d\n", creg);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * implementation of the ioctl to get the stats values from the driver
+ * The argument is the user address to which we do the copy_to_user()
+ */
+static int ipath_get_stats(struct infinipath_stats *ustats)
+{
+	int ret = 0;
+
+	if ((ret = copy_to_user(ustats, &ipath_stats, sizeof(ipath_stats))))
+		_IPATH_DBG("copy_to_user error on driver stats\n");
+
+	return ret;
+}
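+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * ipath_set_partkey() below packs the four 16-bit partition keys into
+ * the single partition key register.  Reduced to a stand-alone sketch
+ * (hypothetical helper, mirroring the packing done in the function):
+ */
+#if 0
+static uint64_t example_pack_pkeys(const uint16_t *pkeys)
+{
+	return (uint64_t) pkeys[0] |
+	    ((uint64_t) pkeys[1] << 16) |
+	    ((uint64_t) pkeys[2] << 32) |
+	    ((uint64_t) pkeys[3] << 48);
+}
+#endif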
+
+/*
+ * set a partition key.  We can have up to 4 active at a time (other than
+ * the default, which is always allowed).  This is somewhat tricky, since
+ * multiple ports may set the same key, so we reference count them, and
+ * clean up at exit.  All 4 partition keys are packed into a single
+ * infinipath register.  It's an error for a process to set the same
+ * pkey multiple times.  We provide no mechanism to de-allocate a pkey
+ * at this time, we may eventually need to do that.
+ * I've used the atomic operations, and no locking, and only make a single
+ * pass through what's available.  This should be more than adequate for
+ * some time.  I'll think about spinlocks or the like if and as it's necessary
+ */
+static int ipath_set_partkey(ipath_portdata *pd, uint16_t key)
+{
+	ipath_devdata *dd;
+	int i, any = 0, pidx = -1;
+	uint16_t lkey = key & 0x7FFF;
+
+	dd = &devdata[pd->port_unit];
+
+	if (lkey == (IPS_DEFAULT_P_KEY & 0x7FFF)) {
+		/* nothing to do; this key always valid */
+		return 0;
+	}
+
+	_IPATH_VDBG
+	    ("p%u try to set pkey %hx, current keys %hx:%x %hx:%x %hx:%x %hx:%x\n",
+	     pd->port_port, key, dd->ipath_pkeys[0],
+	     atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
+	     atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
+	     atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
+	     atomic_read(&dd->ipath_pkeyrefs[3]));
+
+	if (!lkey) {
+		_IPATH_PRDBG("p%u tries to set key 0, not allowed\n",
+			     pd->port_port);
+		return -EINVAL;
+	}
+
+	/*
+	 * Set the full membership bit, because it has to be
+	 * set in the register or the packet, and it seems
+	 * cleaner to set in the register than to force all
+	 * callers to set it.  (see bug 4331)
+	 */
+	key |= 0x8000;
+
+	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+		if (!pd->port_pkeys[i] && pidx == -1)
+			pidx = i;
+		if (pd->port_pkeys[i] == key) {
+			_IPATH_VDBG
+			    ("p%u tries to set same pkey (%x) more than once\n",
+			     pd->port_port, key);
+			return -EEXIST;
+		}
+	}
+	if (pidx == -1) {
+		_IPATH_DBG
+		    ("All pkeys for port %u already in use, can't set %x\n",
+		     pd->port_port, key);
+		return -EBUSY;
+	}
+	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i]) {
+			any++;
+			continue;
+		}
+		if (dd->ipath_pkeys[i] == key) {
+			if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
+				pd->port_pkeys[pidx] = key;
+				_IPATH_VDBG
+				    ("p%u set key %x matches #%d, count now %d\n",
+				     pd->port_port, key, i,
+				     atomic_read(&dd->ipath_pkeyrefs[i]));
+				return 0;
+			} else {
+				/* lost race, decrement count, catch below */
+				atomic_dec(&dd->ipath_pkeyrefs[i]);
+				_IPATH_VDBG
+				    ("Lost race, count was 0, after dec, it's %d\n",
+				     atomic_read(&dd->ipath_pkeyrefs[i]));
+				any++;
+			}
+		}
+		if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+			/*
+			 * It makes no sense to have both the limited and full
+			 * membership PKEY set at the same time since the
+			 * unlimited one will disable the limited one.
+			 */
+			return -EEXIST;
+		}
+	}
+	if (!any) {
+		_IPATH_DBG
+		    ("port %u, all pkeys already in use, can't set %x\n",
+		     pd->port_port, key);
+		return -EBUSY;
+	}
+	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i] &&
+		    atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+			uint64_t pkey;
+
+			/* for ipathstats, etc. */
+			ipath_stats.sps_pkeys[i] = lkey;
+			pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
+			pkey =
+			    (uint64_t) dd->ipath_pkeys[0] |
+			    ((uint64_t) dd->ipath_pkeys[1] << 16) |
+			    ((uint64_t) dd->ipath_pkeys[2] << 32) |
+			    ((uint64_t) dd->ipath_pkeys[3] << 48);
+			_IPATH_PRDBG
+			    ("p%u set key %x in #%d, portidx %d, new pkey reg %llx\n",
+			     pd->port_port, key, i, pidx, pkey);
+			ipath_kput_kreg(pd->port_unit, kr_partitionkey, pkey);
+
+			return 0;
+		}
+	}
+	_IPATH_DBG
+	    ("port %u, all pkeys already in use 2nd pass, can't set %x\n",
+	     pd->port_port, key);
+	return -EBUSY;
+}
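+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * ipath_manage_rcvq() below toggles one bit per port in the receive
+ * control register.  The bit arithmetic, as a stand-alone sketch
+ * (hypothetical helper):
+ */
+#if 0
+static uint64_t example_rcvctrl(uint64_t rcvctrl, unsigned port, int enable)
+{
+	uint64_t bit = 1ULL << (INFINIPATH_R_PORTENABLE_SHIFT + port);
+
+	return enable ? (rcvctrl | bit) : (rcvctrl & ~bit);
+}
+#endif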
+
+/*
+ * start_stop == 0 disables receive on the port, for use in queue overflow
+ * conditions.  start_stop == 1 re-enables, and returns value of tail
+ * register, to be used to re-init the software copy of the head register
+ */
+static int ipath_manage_rcvq(ipath_portdata * pd, uint16_t start_stop)
+{
+	ipath_devdata *dd;
+	/*
+	 * This needs to be volatile, so that the compiler doesn't
+	 * optimize away the read to the device's mapped memory.
+	 */
+	volatile uint64_t tval;
+
+	dd = &devdata[pd->port_unit];
+	_IPATH_PRDBG("%sabling rcv for unit %u port %u\n",
+		     start_stop ? "en" : "dis", pd->port_unit, pd->port_port);
+	/* atomically set or clear the receive enable bit for the port. */
+	if (start_stop) {
+		/*
+		 * on enable, force in-memory copy of the tail register
+		 * to 0, so that protocol code doesn't have to worry
+		 * about whether or not the chip has yet updated
+		 * the in-memory copy or not on return from the system
+		 * call.  The chip always resets its tail register back
+		 * to 0 on a transition from disabled to enabled.
+		 * This could cause a problem if software was broken,
+		 * and did the enable w/o the disable, but eventually
+		 * the in-memory copy will be updated and correct
+		 * itself, even in the face of software bugs.
+		 */
+		*pd->port_rcvhdrtail_kvaddr = 0;
+		atomic_set_mask(1U <<
+				(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port),
+				&dd->ipath_rcvctrl);
+	} else
+		atomic_clear_mask(1U <<
+				  (INFINIPATH_R_PORTENABLE_SHIFT +
+				   pd->port_port), &dd->ipath_rcvctrl);
+	ipath_kput_kreg(pd->port_unit, kr_rcvctrl, dd->ipath_rcvctrl);
+	/* now be sure chip saw it before we return */
+	tval = ipath_kget_kreg64(pd->port_unit, kr_scratch);
+	if (start_stop) {
+		/*
+		 * and try to be sure that tail reg update has happened
+		 * too.  This should in theory interlock with the RXE
+		 * changes to the tail register.  Don't assign it to
+		 * the tail register in memory copy, since we could
+		 * overwrite an update by the chip if we did.
+		 */
+		tval =
+		    ipath_kget_ureg32(pd->port_unit, ur_rcvhdrtail,
+				      pd->port_port);
+	}
+	/* always; new head should be equal to new tail; see above */
+	return 0;
+}
+
+/*
+ * This routine is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance
+ * This is the user port version
+ *
+ * allocate the eager TID buffers and program them into infinipath
+ * They are no longer completely contiguous, we do multiple
+ * alloc_pages() calls.
+ */
+static int ipath_create_user_egr(ipath_portdata * pd)
+{
+	char *buf;
+	ipath_devdata *dd = &devdata[pd->port_unit];
+	uint64_t *egrbase, egroff, lenvalid;
+	unsigned e, egrcnt, alloced, order, egrperchunk, chunk;
+	unsigned long pa, pent;
+
+	egrcnt = dd->ipath_rcvegrcnt;
+	egroff =
+	    dd->ipath_rcvegrbase + pd->port_port * egrcnt * sizeof(*egrbase);
+	egrbase = (uint64_t *) ((char *)(dd->ipath_kregbase) + egroff);
+	_IPATH_VDBG("Allocating %d egr buffers, at chip offset %llx (%p)\n",
+		    egrcnt, egroff, egrbase);
+
+	/*
+	 * to avoid wasting a lot of memory, we allocate 32KB chunks of
+	 * physically contiguous memory, advance through it until used up
+	 * and then allocate more.  Of course, we need memory to store
+	 * those extra pointers, now.  Started out with 256KB, but under
+	 * heavy memory pressure (creating large files and then copying
+	 * them over NFS while doing lots of MPI jobs), we hit some
+	 * alloc_pages() failures, even though we can sleep...  (2.6.10)
+	 * Still get failures at 64K.  32K is the lowest we can go without
+	 * waiting for more memory again.  It seems likely that the
+	 * coalescing in free_pages, etc. still has issues (as it has had
+	 * previously during 2.6.x development).
+	 */
+	order = get_order(0x8000);
+	alloced =
+	    round_up(dd->ipath_rcvegrbufsize * egrcnt,
+		     (1 << order) * PAGE_SIZE);
+	egrperchunk = ((1 << order) * PAGE_SIZE) / dd->ipath_rcvegrbufsize;
+	chunk = (egrcnt + egrperchunk - 1) / egrperchunk;
+	pd->port_rcvegrbuf_chunks = chunk;
+	pd->port_rcvegrbufs_perchunk = egrperchunk;
+	pd->port_rcvegrbuf_order = order;
+	pd->port_rcvegrbuf_pages =
+	    vmalloc(chunk * sizeof(pd->port_rcvegrbuf_pages[0]));
+	pd->port_rcvegrbuf_virt =
+	    vmalloc(chunk * sizeof(pd->port_rcvegrbuf_virt[0]));
+	if (!pd->port_rcvegrbuf_pages || !pd->port_rcvegrbuf_virt) {
+		_IPATH_UNIT_ERROR(pd->port_unit,
+				  "Unable to allocate %u EGR buffer array pointers\n",
+				  chunk);
+		if (pd->port_rcvegrbuf_pages) {
+			vfree(pd->port_rcvegrbuf_pages);
+			pd->port_rcvegrbuf_pages = NULL;
+		}
+		if (pd->port_rcvegrbuf_virt) {
+			vfree(pd->port_rcvegrbuf_virt);
+			pd->port_rcvegrbuf_virt = NULL;
+		}
+		return -ENOMEM;
+	}
+	for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+		/*
+		 * GFP_USER, but without GFP_FS, so buffer cache can
+		 * be coalesced (we hope); otherwise, even at order 4, heavy
+		 * filesystem activity makes these fail
+		 */
+		if (!(pd->port_rcvegrbuf_pages[e] =
+		      alloc_pages(__GFP_WAIT | __GFP_IO, order))) {
+			_IPATH_UNIT_ERROR(pd->port_unit,
+					  "Unable to allocate EGR buffer array %u/%u\n",
+					  e, pd->port_rcvegrbuf_chunks);
+			vfree(pd->port_rcvegrbuf_pages);
+			pd->port_rcvegrbuf_pages = NULL;
+			vfree(pd->port_rcvegrbuf_virt);
+			pd->port_rcvegrbuf_virt = NULL;
+			return -ENOMEM;
+		}
+	}
+
+	/*
+	 * calculate physical, then phys_to_virt()
+	 * so that we get an address that fits in 64 bits, so we can use
+	 * mmap64 from 32 bit programs on the chip and kernel virtual
+	 * addresses (mmap64 for 32 bit programs on i386 and x86_64
+	 * only has 44 bits of address, because it uses mmap2())
+	 * We do this with the first chunk; we don't need a kernel
+	 * virtually contiguous address to give the user virtually
+	 * contiguous mappings.  It just complicates the nopage routine
+	 * a little tiny bit ;)
+	 */
+	buf = page_address(pd->port_rcvegrbuf_pages[0]);
+	pa = virt_to_phys(buf);
+	pd->port_rcvegr_phys = pa;
+
+	/* in words */
+	lenvalid = (dd->ipath_rcvegrbufsize - pd->port_egrskip) >> 2;
+	_IPATH_VDBG
+	    ("port%u egrbuf vaddr %p, cpu %d, egrskip %u, len %llx words\n",
+	     pd->port_port, buf, smp_processor_id(), pd->port_egrskip,
+	     lenvalid);
+	lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+	lenvalid |= INFINIPATH_RT_VALID;
+
+	for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
+		int i, n;
+		struct page *p;
+
+		p = pd->port_rcvegrbuf_pages[chunk];
+		pa = page_to_phys(p);
+		buf = page_address(p);
+		/*
+		 * stash away for later use, since page_address() lookup
+		 * is not cheap
+		 */
+		pd->port_rcvegrbuf_virt[chunk] = buf;
+		if (pa & ~INFINIPATH_RT_ADDR_MASK)
+			_IPATH_INFO
+			    ("physaddr %lx has more than 40 bits, using only 40!\n",
+			     pa);
+		n = 1 << pd->port_rcvegrbuf_order;
+		for (i = 0; i < n; i++)
+			SetPageReserved(virt_to_page(buf + (i * PAGE_SIZE)));
+
+		/* clear buffer for security, sanity, and debugging */
+		memset(buf, 0, PAGE_SIZE * n);
+
+		for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
+			pent =
+			    ((pa +
+			      pd->port_egrskip) & INFINIPATH_RT_ADDR_MASK) |
+			    lenvalid;
+
+			ipath_kput_memq(pd->port_unit, &egrbase[e], pent);
+			_IPATH_VDBG("egr %u phys %lx val %lx\n", e, pa, pent);
+			pa += dd->ipath_rcvegrbufsize;
+		}
+		yield();	/* don't hog the cpu */
+	}
+
+	return 0;
+}
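+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * The chunking arithmetic above, with made-up example numbers: 32 KB
+ * chunks and 2048-byte eager buffers give egrperchunk = 16, so 512
+ * eager TIDs need 32 chunk allocations.  As a sketch:
+ */
+#if 0
+static unsigned example_egr_chunks(unsigned chunkbytes, unsigned bufsize,
+				   unsigned egrcnt)
+{
+	unsigned egrperchunk = chunkbytes / bufsize;
+
+	return (egrcnt + egrperchunk - 1) / egrperchunk;	/* round up */
+}
+#endif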
+
+/*
+ * This routine is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance
+ * This is the kernel (port0) version
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * We use the network layer alloc_skb() allocator to allocate the memory, and
+ * either use the buffers as is for things like SMA packets, or pass
+ * the buffers up to the ipath layered driver and thence the network layer,
+ * replacing them as we do so (see ipath_kreceive())
+ */
+static int ipath_create_port0_egr(ipath_portdata * pd)
+{
+	int ret = 0;
+	uint64_t *egrbase, egroff;
+	unsigned e, egrcnt;
+	ipath_devdata *dd;
+	struct sk_buff **skbs;
+
+	dd = &devdata[pd->port_unit];
+	egrcnt = dd->ipath_rcvegrcnt;
+	egroff =
+	    dd->ipath_rcvegrbase + pd->port_port * egrcnt * sizeof(*egrbase);
+	egrbase = (uint64_t *) ((char *)(dd->ipath_kregbase) + egroff);
+	_IPATH_VDBG
+	    ("unit%u Allocating %d egr buffers, at chip offset %llx (%p)\n",
+	     pd->port_unit, egrcnt, egroff, egrbase);
+
+	skbs = vmalloc(sizeof(*dd->ipath_port0_skbs) * egrcnt);
+	if (skbs == NULL)
+		ret = -ENOMEM;
+	else {
+		for (e = 0; e < egrcnt; e++) {
+			/*
+			 * This is a bit tricky in that we allocate
+			 * extra space for 2 bytes of the 14 byte
+			 * ethernet header.  These two bytes are passed
+			 * in the ipath header so the rest of the data
+			 * is word aligned.  We allocate 4 bytes so that the
+			 * data buffer stays word aligned.
+			 * See ipath_kreceive() for more details.
+			 */
+			skbs[e] =
+			    __dev_alloc_skb(dd->ipath_ibmaxlen + 4, GFP_KERNEL);
+			if (skbs[e] == NULL) {
+				_IPATH_UNIT_ERROR(pd->port_unit,
+						  "SKB allocation error for eager TID %u\n",
+						  e);
+				while (e != 0)
+					dev_kfree_skb(skbs[--e]);
+				ret = -ENOMEM;
+				break;
+			}
+			skb_reserve(skbs[e], 4);
+		}
+	}
+	/*
+	 * after loop above, so we can test non-NULL
+	 * to see if ready to use at receive, etc.  Hope this fixes some
+	 * panics.
+	 */
+	dd->ipath_port0_skbs = skbs;
+
+	/*
+	 * have to tell chip each time we init it
+	 * even if we are re-using previous memory.
+	 */
+	if (!ret) {
+		uint64_t lenvalid;	/* in words */
+
+		lenvalid = (dd->ipath_ibmaxlen - pd->port_egrskip) >> 2;
+		lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+		lenvalid |= INFINIPATH_RT_VALID;
+		for (e = 0; e < egrcnt; e++) {
+			unsigned long pa, pent;
+
+			pa = virt_to_phys(dd->ipath_port0_skbs[e]->data);
+			pa += pd->port_egrskip;
+			if (!e && (pa & ~INFINIPATH_RT_ADDR_MASK))
+				_IPATH_INFO
+				    ("phys addr %lx has more than 40 bits, using only 40!!!\n",
+				     pa);
+			pent = (pa & INFINIPATH_RT_ADDR_MASK) | lenvalid;
+			/*
+			 * don't need this except for extreme debugging,
+			 * but leaving it to save future typing:
+			 * _IPATH_VDBG("egr[%d] %p <- %lx\n", e, &egrbase[e], pent);
+			 */
+			ipath_kput_memq(pd->port_unit, &egrbase[e], pent);
+		}
+		yield();	/* don't hog the cpu */
+	}
+
+	return ret;
+}
+
+/*
+ * this *must* be physically contiguous memory, and for now,
+ * that limits it to what kmalloc can do.
+ */
+static int ipath_create_rcvhdrq(ipath_portdata * pd)
+{
+	int i, ret = 0, amt, order, pgs;
+	char *qt;
+	struct page *p;
+	unsigned long pa, pa0;
+
+	amt = round_up(devdata[pd->port_unit].ipath_rcvhdrcnt
+		       * devdata[pd->port_unit].ipath_rcvhdrentsize *
+		       sizeof(uint32_t), PAGE_SIZE);
+	if (!pd->port_rcvhdrq) {
+		order = get_order(amt);
+		/*
+		 * not using REPEAT isn't viable; at 128KB, we can easily
+		 * fail this.  The problem with REPEAT is we can block here
+		 * "forever".  There isn't an in-between, unfortunately.
+		 * We could reduce the risk by never freeing the rcvhdrq
+		 * except at unload, but even then, the first time a
+		 * port is used, we could delay for some time...
+		 */
+		p = alloc_pages(GFP_USER, order);
+		if (!p) {
+			_IPATH_UNIT_ERROR(pd->port_unit,
+					  "attempt to allocate order %u memory for port %u rcvhdrq failed\n",
+					  order, pd->port_port);
+			return -ENOMEM;
+		}
+
+		/*
+		 * should use kmap (and later kunmap), even though high mem
+		 * will always be mapped on x86_64, to play it safe, but for
+		 * some bizarre reason these aren't exported symbols...
+		 */
+		pd->port_rcvhdrq = page_address(p);
+		if (!virt_addr_valid(pd->port_rcvhdrq)) {
+			_IPATH_DBG
+			    ("weird, virt_addr_valid false right after alloc_pages\n");
+			_IPATH_DBG("__pa(%p) is %lx, num_physpages %lx\n",
+				   pd->port_rcvhdrq, __pa(pd->port_rcvhdrq),
+				   num_physpages);
+		}
+		pd->port_rcvhdrq_phys = virt_to_phys(pd->port_rcvhdrq);
+		pd->port_rcvhdrq_order = order;
+
+		pa0 = pd->port_rcvhdrq_phys;
+		pgs = amt >> PAGE_SHIFT;
+		_IPATH_VDBG
+		    ("%d pages at %p (phys %lx) order=%u for port %u rcvhdr Q\n",
+		     pgs, pd->port_rcvhdrq, pa0, pd->port_rcvhdrq_order,
+		     pd->port_port);
+
+		/*
+		 * verify it's really physically contiguous, to be paranoid;
+		 * also mark pages as reserved, to avoid problems when a
+		 * user process with them mapped then exits.
+		 */
+		qt = pd->port_rcvhdrq;
+		SetPageReserved(virt_to_page(qt));
+		qt += PAGE_SIZE;
+		for (pa = pa0, i = 1; i < pgs; i++, qt += PAGE_SIZE) {
+			SetPageReserved(virt_to_page(qt));
+			pa = virt_to_phys(qt);
+			if (pa != (pa0 + (i * PAGE_SIZE)))
+				_IPATH_INFO
+				    ("pg %d at %p phys %lx not contiguous\n", i,
+				     qt, pa);
+			else
+				_IPATH_VDBG("pg %d at %p phys %lx\n", i, qt,
+					    pa);
+		}
+	}
+
+	/*
+	 * clear for security, sanity, and/or debugging (each time we
+	 * use/reuse)
+	 */
+	memset(pd->port_rcvhdrq, 0, amt);
+
+	/*
+	 * tell chip each time we init it, even if we are re-using previous
+	 * memory (we zero it at process close)
+	 */
+	_IPATH_VDBG("writing port %d rcvhdraddr as %lx\n", pd->port_port,
+		    pd->port_rcvhdrq_phys);
+	ipath_kput_kreg_port(pd->port_unit, kr_rcvhdraddr, pd->port_port,
+			     pd->port_rcvhdrq_phys);
+
+	return ret;
+}
+
+#ifdef _IPATH_EXTRA_DEBUG
+/*
+ * occasionally useful to dump the full set of kernel registers for debugging.
+ */
+static void ipath_dump_allregs(char *what, ipath_type t)
+{
+	uint16_t reg;
+
+	_IPATH_DBG("%s\n", what);
+	for (reg = 0; reg <= 0x100; reg++) {
+		uint64_t v = ipath_kget_kreg64(t, reg);
+
+		if (!(reg % 4))
+			printk("\n%3x: ", reg);
+		printk("%16llx ", v);
+	}
+	printk("\n");
+}
+#endif /* _IPATH_EXTRA_DEBUG */
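+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * The paranoid contiguity walk in ipath_create_rcvhdrq() above,
+ * reduced to a predicate (hypothetical helper):
+ */
+#if 0
+static int example_physically_contiguous(void *base, int npages)
+{
+	unsigned long pa0 = virt_to_phys(base);
+	int i;
+
+	for (i = 1; i < npages; i++)
+		if (virt_to_phys((char *)base + i * PAGE_SIZE) !=
+		    pa0 + i * PAGE_SIZE)
+			return 0;
+	return 1;
+}
+#endif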
+
+/*
+ * Do the actual initialization sequence on the chip.  For the real
+ * hardware, this is done from the init routine called from the PCI
+ * infrastructure.
+ */
+int ipath_init_chip(const ipath_type t)
+{
+	int ret = 0, i;
+	uint32_t val32, kpiobufs;
+	uint64_t val, atmp;
+	volatile uint32_t *piobuf;
+	uint32_t pioincr;
+	ipath_devdata *dd = &devdata[t];
+	ipath_portdata *pd;
+	struct page *vpage;
+	char boardn[32];
+
+	/* first time only, set after static version info */
+	if (!chip_driver_version) {
+		i = strlen(ipath_core_version);
+		chip_driver_version = ipath_core_version + i;
+		chip_driver_size = sizeof ipath_core_version - i;
+	}
+
+	/*
+	 * have to clear shadow copies of registers at init that are not
+	 * otherwise set here, or all kinds of bizarre things happen with
+	 * driver on chip reset
+	 */
+	dd->ipath_rcvhdrsize = 0;
+
+	/*
+	 * don't clear ipath_flags, as 8bit mode was set before entering
+	 * this func.  However, we do set the linkstate to unknown
+	 */
+
+	/* so we can watch for a transition */
+	dd->ipath_flags |= IPATH_LINKUNK;
+	dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED | IPATH_LINKDOWN
+			     | IPATH_LINKINIT);
+
+	_IPATH_VDBG("Try to read spc chip revision\n");
+	dd->ipath_revision = ipath_kget_kreg64(t, kr_revision);
+
+	/*
+	 * set up fundamental info we need to use the chip; we assume if
+	 * the revision reg and these regs are OK, we don't need to special
+	 * case the rest
+	 */
+	dd->ipath_sregbase = ipath_kget_kreg32(t, kr_sendregbase);
+	dd->ipath_cregbase = ipath_kget_kreg32(t, kr_counterregbase);
+	dd->ipath_uregbase = ipath_kget_kreg32(t, kr_userregbase);
+	_IPATH_VDBG("ipath_kregbase %p, sendbase %x usrbase %x, cntrbase %x\n",
+		    dd->ipath_kregbase, dd->ipath_sregbase, dd->ipath_uregbase,
+		    dd->ipath_cregbase);
+	if ((dd->ipath_revision & 0xffffffff) == 0xffffffff ||
+	    (dd->ipath_sregbase & 0xffffffff) == 0xffffffff ||
+	    (dd->ipath_cregbase & 0xffffffff) == 0xffffffff ||
+	    (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
+		_IPATH_UNIT_ERROR(t,
+				  "Register read failures from chip, giving up initialization\n");
+		ret = -ENODEV;
+		goto done;
+	}
+
+	/* clear the initial reset flag, in case first driver load */
+	ipath_kput_kreg(t, kr_errorclear, INFINIPATH_E_RESET);
+
+	dd->ipath_portcnt = ipath_kget_kreg32(t, kr_portcnt);
+	if (!infinipath_cfgports)
+		dd->ipath_cfgports = dd->ipath_portcnt;
+	else if (infinipath_cfgports <= dd->ipath_portcnt) {
+		dd->ipath_cfgports = infinipath_cfgports;
+		_IPATH_DBG("Configured to use %u ports out of %u in chip\n",
+			   dd->ipath_cfgports, dd->ipath_portcnt);
+	} else {
+		dd->ipath_cfgports = dd->ipath_portcnt;
+		_IPATH_DBG
+		    ("Tried to configure %u ports; chip only supports %u\n",
+		     infinipath_cfgports, dd->ipath_portcnt);
+	}
+	dd->ipath_pd = kmalloc(sizeof(*dd->ipath_pd) * dd->ipath_cfgports,
+			       GFP_KERNEL);
+	if (!dd->ipath_pd) {
+		_IPATH_UNIT_ERROR(t,
+				  "Unable to allocate portdata array, failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	memset(dd->ipath_pd, 0, sizeof(*dd->ipath_pd) * dd->ipath_cfgports);
+
+	dd->ipath_lastegrheads = kmalloc(sizeof(*dd->ipath_lastegrheads)
+					 * dd->ipath_cfgports, GFP_KERNEL);
+	dd->ipath_lastrcvhdrqtails = kmalloc(sizeof(*dd->ipath_lastrcvhdrqtails)
+					     * dd->ipath_cfgports, GFP_KERNEL);
+	if (!dd->ipath_lastegrheads || !dd->ipath_lastrcvhdrqtails) {
+		_IPATH_UNIT_ERROR(t,
+				  "Unable to allocate head arrays, failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	memset(dd->ipath_lastrcvhdrqtails, 0,
+	       sizeof(*dd->ipath_lastrcvhdrqtails)
+	       * dd->ipath_cfgports);
+	memset(dd->ipath_lastegrheads, 0, sizeof(*dd->ipath_lastegrheads)
+	       * dd->ipath_cfgports);
+
+	dd->ipath_pd[0] = kmalloc(sizeof(ipath_portdata), GFP_KERNEL);
+	if (!dd->ipath_pd[0]) {
+		_IPATH_UNIT_ERROR(t,
+				  "Unable to allocate portdata for port 0, failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	memset(dd->ipath_pd[0], 0, sizeof(ipath_portdata));
+
+	pd = dd->ipath_pd[0];
+	pd->port_unit = t;
+	pd->port_port = 0;
+	pd->port_cnt = 1;
+	/* The port 0 pkey table is used by the layer interface. */
+	pd->port_pkeys[0] = IPS_DEFAULT_P_KEY;
+
+	dd->ipath_rcvtidcnt = ipath_kget_kreg32(t, kr_rcvtidcnt);
+	dd->ipath_rcvtidbase = ipath_kget_kreg32(t, kr_rcvtidbase);
+	dd->ipath_rcvegrcnt = ipath_kget_kreg32(t, kr_rcvegrcnt);
+	dd->ipath_rcvegrbase = ipath_kget_kreg32(t, kr_rcvegrbase);
+	dd->ipath_palign = ipath_kget_kreg32(t, kr_pagealign);
+	dd->ipath_piobufbase = ipath_kget_kreg32(t, kr_sendpiobufbase);
+	dd->ipath_piosize = ipath_kget_kreg32(t, kr_sendpiosize);
+	dd->ipath_ibmtu = 4096;	/* default to largest legal MTU */
+	dd->ipath_piobcnt = ipath_kget_kreg32(t, kr_sendpiobufcnt);
+
+	_IPATH_VDBG
+	    ("Revision %llx (PCI %x), %u ports, %u tids, %u egrtids, %u piobufs\n",
+	     dd->ipath_revision, dd->ipath_pcirev, dd->ipath_portcnt,
+	     dd->ipath_rcvtidcnt, dd->ipath_rcvegrcnt, dd->ipath_piobcnt);
+
+	if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
+	     INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
+		/* >= maybe, someday */
+		_IPATH_UNIT_ERROR(t,
+				  "Driver only handles version %d, chip swversion is %d (%llx), failing\n",
+				  IPATH_CHIP_SWVERSION,
+				  (int)(dd->ipath_revision >>
+					INFINIPATH_R_SOFTWARE_SHIFT) &
+				  INFINIPATH_R_SOFTWARE_MASK,
+				  dd->ipath_revision);
+		ret = -ENOSYS;
+		goto done;
+	}
+	dd->ipath_majrev = (uint8_t) ((dd->ipath_revision >>
+				       INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
+				      INFINIPATH_R_CHIPREVMAJOR_MASK);
+	dd->ipath_minrev =
+	    (uint8_t) ((dd->ipath_revision >> INFINIPATH_R_CHIPREVMINOR_SHIFT) &
+		       INFINIPATH_R_CHIPREVMINOR_MASK);
+	dd->ipath_boardrev =
+	    (uint8_t) ((dd->ipath_revision >> INFINIPATH_R_BOARDID_SHIFT) &
+		       INFINIPATH_R_BOARDID_MASK);
+
+	ipath_get_boardname(t, boardn, sizeof boardn);
+
+	snprintf(chip_driver_version, chip_driver_size,
+		 "Driver %u.%u, %s, InfiniPath%u %u.%u, PCI %u, SW Compat %u\n",
+		 IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
+		 (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
+		 INFINIPATH_R_ARCH_MASK, dd->ipath_majrev,
+		 dd->ipath_minrev, dd->ipath_pcirev,
+		 (unsigned)(dd->ipath_revision >>
+			    INFINIPATH_R_SOFTWARE_SHIFT) &
+		 INFINIPATH_R_SOFTWARE_MASK);
+
+	_IPATH_DBG("%s", chip_driver_version);
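+
+	/*
+	 * [Editor's illustrative aside -- not part of the original patch]
+	 * all the revision fields above are unpacked with the same
+	 * shift-and-mask idiom; as a sketch (hypothetical helper):
+	 *
+	 *	static unsigned example_rev_field(uint64_t rev,
+	 *					  unsigned shift,
+	 *					  unsigned mask)
+	 *	{
+	 *		return (unsigned)((rev >> shift) & mask);
+	 *	}
+	 */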
+
+	/*
+	 * we ignore most issues after reporting them, but have to specially
+	 * handle hardware-disabled chips.
+	 */
+	if (ipath_validate_rev(dd) == 2) {
+		/* unique error, known to infinipath_init_one() */
+		ret = -EPERM;
+		goto done;
+	}
+
+	/*
+	 * zero all the TID entries at startup.  We do this for sanity,
+	 * in case of a previous driver crash of some kind, and also
+	 * because the chip powers up with these memories in an unknown
+	 * state.  Use portcnt, not cfgports, since this is for the full
+	 * chip, not for current (possibly different) configuration value
+	 * Chip Errata bug 6447
+	 */
+	for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
+		ipath_clear_tids(t, val32);
+
+	dd->ipath_rcvhdrentsize = IPATH_RCVHDRENTSIZE;
+	/*
+	 * we could bump this to allow for full rcvegrcnt + rcvtidcnt, but
+	 * then it no longer nicely fits power of two, and since we now use
+	 * alloc_pages, the rest would be wasted.
+	 */
+	dd->ipath_rcvhdrcnt = dd->ipath_rcvegrcnt;
+	/*
+	 * setup offset of last valid entry in rcvhdrq, for various tests, to
+	 * avoid calculating each time we need it
+	 */
+	dd->ipath_hdrqlast =
+	    dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
+	ipath_kput_kreg(t, kr_rcvhdrentsize, dd->ipath_rcvhdrentsize);
+	ipath_kput_kreg(t, kr_rcvhdrcnt, dd->ipath_rcvhdrcnt);
+	/*
+	 * not in ipath_rcvhdrsize, so user programs can set differently, but
+	 * so any early packets see the default size.
+	 */
+	ipath_kput_kreg(t, kr_rcvhdrsize, IPATH_DFLT_RCVHDRSIZE);
+
+	/*
+	 * we "know" that this works out OK.  It's actually a bit more than
+	 * we need, but 2048+64 isn't quite enough for full size, and we
+	 * want the +N to be a power of 2 to give us reasonable alignment
+	 * and fit within page_alloc()'ed memory
+	 */
+	dd->ipath_rcvegrbufsize = dd->ipath_piosize;
+
+	/*
+	 * the min() check here is currently a nop, but it may not always be,
+	 * depending on just how we do ipath_rcvegrbufsize
+	 */
+	dd->ipath_ibmaxlen = min(dd->ipath_piosize, dd->ipath_rcvegrbufsize);
+	dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
+
+	/*
+	 * set up the shadow copies of the piobufavail registers, which
+	 * we compare against the chip registers for now, and the in
+	 * memory DMA'ed copies of the registers.  This has to be done
+	 * early, before we calculate lastport, etc.
+	 */
+	val = dd->ipath_piobcnt;
+	/*
+	 * calc number of pioavail registers, and save it; we have 2 bits
+	 * per buffer
+	 */
+	dd->ipath_pioavregs =
+	    round_up(val, sizeof(uint64_t) * _BITS_PER_BYTE / 2) /
+	    (sizeof(uint64_t) * _BITS_PER_BYTE / 2);
+	if (dd->ipath_pioavregs >
+	    (sizeof(dd->ipath_pioavailshadow) /
+	     sizeof(dd->ipath_pioavailshadow[0]))) {
+		dd->ipath_pioavregs =
+		    sizeof(dd->ipath_pioavailshadow) /
+		    sizeof(dd->ipath_pioavailshadow[0]);
+		/* 2 bits/reg */
+		dd->ipath_piobcnt =
+		    dd->ipath_pioavregs * sizeof(uint64_t) * _BITS_PER_BYTE >> 1;
+		_IPATH_INFO
+		    ("Warning: %lld piobufs is too many to fit in shadow, only using %d\n",
+		     val, dd->ipath_piobcnt);
+	}
+
+	if (!infinipath_kpiobufs) {
+		/* have to have at least one, for SMA */
+		kpiobufs = infinipath_kpiobufs = 1;
+	} else if (dd->ipath_piobcnt <
+		   (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT)) {
+		_IPATH_INFO
+		    ("Too few PIO buffers (%u) for %u ports to have %u each!\n",
+		     dd->ipath_piobcnt, dd->ipath_cfgports,
+		     IPATH_MIN_USER_PORT_BUFCNT);
+		kpiobufs = 1;	/* reserve just the minimum for SMA/ether */
+	} else
+		kpiobufs = infinipath_kpiobufs;
+
+	if (kpiobufs >
+	    (dd->ipath_piobcnt -
+	     (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT))) {
+		i = dd->ipath_piobcnt -
+		    (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT);
+		if (i < 0)
+			i = 0;
+		_IPATH_INFO
+		    ("Allocating %d PIO bufs for kernel leaves too few for %d user ports (%d each); using %u\n",
+		     kpiobufs, dd->ipath_cfgports - 1,
+		     IPATH_MIN_USER_PORT_BUFCNT, i);
+		/*
+		 * shouldn't change infinipath_kpiobufs, because could be
+		 * different for different devices...
+		 */
+		kpiobufs = i;
+	}
+	dd->ipath_lastport_piobuf = dd->ipath_piobcnt - kpiobufs;
+	dd->ipath_pbufsport = dd->ipath_cfgports > 1 ?
+	    dd->ipath_lastport_piobuf / (dd->ipath_cfgports - 1) : 0;
+	val32 = dd->ipath_lastport_piobuf -
+	    (dd->ipath_pbufsport * (dd->ipath_cfgports - 1));
+	if (val32 > 0) {
+		_IPATH_DBG
+		    ("allocating %u pbufs/port leaves %u unused, add to kernel\n",
+		     dd->ipath_pbufsport, val32);
+		dd->ipath_lastport_piobuf -= val32;
+	}
+	dd->ipath_lastpioindex = dd->ipath_lastport_piobuf;
+	_IPATH_VDBG
+	    ("%d PIO bufs %u - %u, %u each for %u user ports\n",
+	     kpiobufs, dd->ipath_lastport_piobuf, dd->ipath_piobcnt,
+	     dd->ipath_pbufsport, dd->ipath_cfgports - 1);
+
+	/*
+	 * this has to be page aligned, and on a page of its own, so we
+	 * can map it into user space.  We also use it to give processes
+	 * a copy of ipath_statusp, on a separate cacheline, followed by
+	 * a copy of the freeze error string, if it's happened.
+	 * Might also use that space for other things.
+	 */
+	val = round_up(2 * L1_CACHE_BYTES + sizeof(*dd->ipath_statusp) +
+		       dd->ipath_pioavregs * sizeof(uint64_t), 2 * PAGE_SIZE);
+	if (!(dd->ipath_pioavailregs_dma = kmalloc(val * sizeof(uint64_t),
+						   GFP_KERNEL))) {
+		_IPATH_UNIT_ERROR(t,
+				  "failed to allocate PIOavail reg area in memory\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	if ((PAGE_SIZE - 1) & (uint64_t) dd->ipath_pioavailregs_dma) {
+		dd->__ipath_pioavailregs_base = dd->ipath_pioavailregs_dma;
+		dd->ipath_pioavailregs_dma = (uint64_t *)
+		    round_up((uint64_t) dd->ipath_pioavailregs_dma, PAGE_SIZE);
+	} else
+		dd->__ipath_pioavailregs_base = dd->ipath_pioavailregs_dma;
+	/*
+	 * zero initial, since whole thing mapped
+	 * into user space, and don't want info leak, or confusing garbage
+	 */
+	memset((void *)dd->ipath_pioavailregs_dma, 0, PAGE_SIZE);
+
+	/*
+	 * we really want L2 cache aligned, but for current CPUs of interest,
+	 * they are the same.
+	 */
+	dd->ipath_statusp = (uint64_t *) ((char *)dd->ipath_pioavailregs_dma +
+					  ((2 * L1_CACHE_BYTES +
+					    dd->ipath_pioavregs *
+					    sizeof(uint64_t)) &
+					   ~(L1_CACHE_BYTES - 1)));
+	/* copy the current value now that it's really allocated */
+	*dd->ipath_statusp = dd->_ipath_status;
+	/*
+	 * setup buffer to hold freeze msg, accessible to apps, following
+	 * statusp
+	 */
+	dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
+	/* and its length */
+	dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
+
+	atmp = virt_to_phys(dd->ipath_pioavailregs_dma);
+	/* stash physical address for user progs */
+	dd->ipath_pioavailregs_phys = atmp;
+	(void)ipath_kput_kreg(t, kr_sendpioavailaddr, atmp);
+	/*
+	 * this is to detect s/w errors, which the h/w works around by
+	 * ignoring the low 6 bits of address, if it wasn't aligned.
+	 */
+	val = ipath_kget_kreg64(t, kr_sendpioavailaddr);
+	if (val != atmp) {
+		_IPATH_UNIT_ERROR(t,
+				  "Catastrophic software error, SendPIOAvailAddr written as %llx, read back as %llx\n",
+				  atmp, val);
+		ret = -EINVAL;
+		goto done;
+	}
+
+	if (t * 64 > (sizeof(ipath_port0_rcvhdrtail) - 64)) {
+		_IPATH_UNIT_ERROR(t,
+				  "unit %u too large for port 0 rcvhdrtail buffer size\n",
+				  t);
+		ret = -ENODEV;
+	}
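+
+	/*
+	 * [Editor's illustrative aside -- not part of the original patch]
+	 * the statusp placement above uses the usual align-down idiom:
+	 * for a power-of-two size, the mask must be (size - 1), e.g.
+	 *
+	 *	aligned = x & ~((uint64_t) (L1_CACHE_BYTES - 1));
+	 */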
+
+	/*
+	 * kernel modules are loaded into vmalloc'ed memory; verify that
+	 * when we assume that, map to phys, and back to virt, we get the
+	 * right contents, so we did the mapping right.
+	 */
+	vpage = vmalloc_to_page((void *)ipath_port0_rcvhdrtail);
+	if (vpage == NOPAGE_SIGBUS || vpage == NOPAGE_OOM) {
+		_IPATH_UNIT_ERROR(t, "vmalloc_to_page for rcvhdrtail fails!\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * 64 is driven by cache line size, and also by chip requirement
+	 * that low 6 bits be 0
+	 */
+	val = page_to_phys(vpage) + t * 64;
+
+	/* verify that the alignment requirement was met */
+	ipath_kput_kreg_port(t, kr_rcvhdrtailaddr, 0, val);
+	atmp = ipath_kget_kreg64_port(t, kr_rcvhdrtailaddr, 0);
+	if (val != atmp) {
+		_IPATH_UNIT_ERROR(t,
+				  "Catastrophic software error, RcvHdrTailAddr0 written as %llx, read back as %llx from %x\n",
+				  val, atmp, kr_rcvhdrtailaddr);
+		ret = -EINVAL;
+		goto done;
+	}
+	/* so we can get current tail in ipath_kreceive(), per chip */
+	dd->ipath_hdrqtailptr =
+	    &ipath_port0_rcvhdrtail[t *
+				    (64 / sizeof(ipath_port0_rcvhdrtail[0]))];
+
+	ipath_kput_kreg(t, kr_rcvbthqp, IPATH_KD_QP);
+
+	/*
+	 * make sure we are not in freeze, and PIO send enabled, so
+	 * writes to pbc happen
+	 */
+	ipath_kput_kreg(t, kr_hwerrmask, 0ULL);
+	ipath_kput_kreg(t, kr_hwerrclear, ~0ULL);
+	ipath_kput_kreg(t, kr_control, 0ULL);
+	ipath_kput_kreg(t, kr_sendctrl, INFINIPATH_S_PIOENABLE);
+
+	/*
+	 * write the pbc of each buffer, to be sure it's initialized, then
+	 * cancel all the buffers, and also abort any packets that might
+	 * have been in flight for some reason (the latter is for driver
+	 * unload/reload, but isn't a bad idea at first init).
+	 * PIO send isn't enabled at this point, so there is no danger
+	 * of sending these out on the wire.
+	 * Chip Errata bug 6610
+	 */
+	piobuf = (uint32_t *) (((char *)(dd->ipath_kregbase)) +
+			       dd->ipath_piobufbase);
+	pioincr = devdata[t].ipath_palign / sizeof(*piobuf);
+	for (i = 0; i < dd->ipath_piobcnt; i++) {
+		*piobuf = 16;	/* reasonable word count, just to init pbc */
+		piobuf += pioincr;
+	}
+	/* self-clearing */
+	ipath_kput_kreg(t, kr_sendctrl, INFINIPATH_S_ABORT);
+
+	/*
+	 * before error clears, since we expect serdes pll errors during
+	 * this, the first time after reset
+	 */
+	if (ipath_bringup_link(t)) {
+		_IPATH_INFO("Failed to bringup IB link\n");
+		ret = -ENETDOWN;
+		goto done;
+	}
+
+	/*
+	 * clear any "expected" hwerrs from reset and/or initialization
+	 * clear any that aren't enabled (at least this once), and then
+	 * set the enable mask
+	 */
+	ipath_clear_init_hwerrs(t);
+	ipath_kput_kreg(t, kr_hwerrclear, ~0ULL);
+	ipath_kput_kreg(t, kr_hwerrmask, dd->ipath_hwerrmask);
+
+	dd->ipath_maskederrs = dd->ipath_ignorederrs;
+	/* clear all */
+	ipath_kput_kreg(t, kr_errorclear, ~0ULL);
+	/* enable errors that are masked, at least this first time. */
+	ipath_kput_kreg(t, kr_errormask, ~dd->ipath_maskederrs);
+	/* clear any interrupts up to this point (ints still not enabled) */
+	ipath_kput_kreg(t, kr_intclear, ~0ULL);
+
+	ipath_stats.sps_lid[t] = dd->ipath_lid;
+
+	/*
+	 * allocate the shadow TID array, so we can ipath_munlock
+	 * previous entries.
+	 * It may make more sense to move the pageshadow to the port
+	 * data structure, so we only allocate memory for ports actually
+	 * in use, since we're at 8k per port now.
+	 */
+	dd->ipath_pageshadow = (struct page **)
+	    vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+		    sizeof(struct page *));
+	if (!dd->ipath_pageshadow)
+		_IPATH_UNIT_ERROR(t,
+				  "failed to allocate shadow page * array, no expected sends!\n");
+	else
+		memset(dd->ipath_pageshadow, 0,
+		       dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+		       sizeof(struct page *));
+
+	/* set up the port 0 (kernel) rcvhdr q and egr TIDs */
+	if (!(ret = ipath_create_rcvhdrq(dd->ipath_pd[0])))
+		ret = ipath_create_port0_egr(dd->ipath_pd[0]);
+	if (ret)
+		_IPATH_UNIT_ERROR(t,
+				  "failed to allocate port 0 (kernel) rcvhdrq and/or egr bufs\n");
+	else {
+		init_waitqueue_head(&ipath_sma_wait);
+		init_waitqueue_head(&ipath_sma_state_wait);
+
+		ipath_kput_kreg(pd->port_unit, kr_rcvctrl, dd->ipath_rcvctrl);
+
+		ipath_kput_kreg(t, kr_rcvbthqp, IPATH_KD_QP);
+
+		/* Enable PIO send, and update of PIOavail regs to memory. */
+		dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE
+		    | INFINIPATH_S_PIOBUFAVAILUPD;
+		ipath_kput_kreg(t, kr_sendctrl, dd->ipath_sendctrl);
+
+		/*
+		 * enable port 0 receive, and receive interrupt
+		 * other ports done as user opens and inits them
+		 */
+		dd->ipath_rcvctrl = INFINIPATH_R_TAILUPD |
+		    (1ULL << INFINIPATH_R_PORTENABLE_SHIFT) |
+		    (1ULL << INFINIPATH_R_INTRAVAIL_SHIFT);
+		ipath_kput_kreg(t, kr_rcvctrl, dd->ipath_rcvctrl);
+
+		/*
+		 * now ready for use
+		 * this should be cleared whenever we detect a reset, or
+		 * initiate one.
+		 */
+		dd->ipath_flags |= IPATH_INITTED;
+
+		/*
+		 * init our shadow copies of head from tail values, and write
+		 * head values to match
+		 */
+		val32 = ipath_kget_ureg32(t, ur_rcvegrindextail, 0);
+		(void)ipath_kput_ureg(t, ur_rcvegrindexhead, val32, 0);
+		dd->ipath_port0head = ipath_kget_ureg32(t, ur_rcvhdrtail, 0);
+		(void)ipath_kput_ureg(t, ur_rcvhdrhead, dd->ipath_port0head, 0);
+
+		/*
+		 * by now pioavail updates to memory should have occurred,
+		 * so copy them into our working/shadow registers; this is
+		 * in case something went wrong with abort, but mostly to
+		 * get the initial values of the generation bit correct
+		 */
+		for (i = 0; i < dd->ipath_pioavregs; i++) {
+			/*
+			 * Chip Errata bug 6641; even and odd qwords>3
+			 * are swapped
+			 */
+			if (i > 3) {
+				if (i & 1)
+					dd->ipath_pioavailshadow[i] =
+					    dd->ipath_pioavailregs_dma[i - 1];
+				else
+					dd->ipath_pioavailshadow[i] =
+					    dd->ipath_pioavailregs_dma[i + 1];
+			} else
+				dd->ipath_pioavailshadow[i] =
+				    dd->ipath_pioavailregs_dma[i];
+		}
+		/* can get counters, stats, etc. */
+		dd->ipath_flags |= IPATH_PRESENT;
+	}
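+
+	/*
+	 * [Editor's illustrative aside -- not part of the original patch]
+	 * the errata 6641 workaround above maps DMA-copy indices to
+	 * shadow indices; as a sketch (hypothetical helper):
+	 *
+	 *	static unsigned example_errata6641_index(unsigned i)
+	 *	{
+	 *		if (i > 3)
+	 *			return (i & 1) ? i - 1 : i + 1;
+	 *		return i;
+	 *	}
+	 */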
+
+/*
+ * like ipath_waitfor_complete(), but we wait for the CMDVALID bit to go
+ * away, indicating the last command has completed.  It doesn't return data
+ */
+int ipath_waitfor_mdio_cmdready(const ipath_type t)
+{
+	uint64_t timeout;
+	uint64_t val;
+
+	timeout = get_cycles() + 0x10000000ULL;	/* <- ridiculously long time */
+	do {
+		val = ipath_kget_kreg64(t, kr_mdio);
+		if (!(val & IPATH_MDIO_CMDVALID))
+			return 0;
+		yield();
+		if (get_cycles() > timeout) {
+			_IPATH_DBG("CMDVALID stuck in mdio reg? (%llx)\n", val);
+			return ENODEV;
+		}
+	} while (1);
+}
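/*
 * The two routines above share one pattern: poll a register until some
 * bits appear (or disappear), yielding between reads, and give up after
 * a deadline.  A minimal userspace sketch of that pattern, assuming a
 * hypothetical read_reg() callback and a 2-second budget in place of the
 * driver's get_cycles() arithmetic:
 */
#include <stdint.h>
#include <time.h>
#include <sched.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* poll until all bits in 'want' are set, yielding between reads */
int wait_for_bits(uint64_t (*read_reg)(void), uint64_t want, uint64_t *valp)
{
	uint64_t deadline = now_ns() + 2000000000ull;	/* ~2 s budget */

	do {
		*valp = read_reg();	/* caller sees the last value even on failure */
		if ((*valp & want) == want)
			return 0;
		sched_yield();	/* don't monopolize the CPU while we wait */
	} while (now_ns() < deadline);
	return -1;	/* timed out */
}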
+
+void ipath_set_ib_lstate(const ipath_type t, int which)
+{
+	ipath_devdata *dd = &devdata[t];
+	char *what;
+
+	/*
+	 * For all cases, we'll either be setting a new value of linkcmd,
+	 * or we want it to be NOP, so clear it here.
+	 * Similarly, we want the linkinitcmd to be NOP for everything
+	 * other than explicitly changing linkinitcmd, and for that case,
+	 * we want to first clear any existing bits
+	 */
+	dd->ipath_ibcctrl &= ~((INFINIPATH_IBCC_LINKCMD_MASK <<
+				INFINIPATH_IBCC_LINKCMD_SHIFT) |
+			       (INFINIPATH_IBCC_LINKINITCMD_MASK <<
+				INFINIPATH_IBCC_LINKINITCMD_SHIFT));
+
+	if (which == INFINIPATH_IBCC_LINKCMD_INIT) {
+		dd->ipath_flags &= ~(IPATH_LINK_TOARMED | IPATH_LINK_TOACTIVE
+				     | IPATH_LINK_SLEEPING);
+		/* so we can watch for a transition */
+		dd->ipath_flags |= IPATH_LINKDOWN;
+		what = "INIT";
+	} else if (which == INFINIPATH_IBCC_LINKCMD_ARMED) {
+		dd->ipath_flags |= IPATH_LINK_TOARMED;
+		dd->ipath_flags &= ~(IPATH_LINK_TOACTIVE | IPATH_LINK_SLEEPING);
+		/*
+		 * this is mainly for loopback testing.  If INITCMD is
+		 * NOP or SLEEP, the link won't ever come up in loopback...
+		 */
+		if (!(dd->ipath_flags & (IPATH_LINKINIT | IPATH_LINKARMED |
+					 IPATH_LINKACTIVE))) {
+			_IPATH_SMADBG("going to armed, but link not yet up, set POLL\n");
+			dd->ipath_ibcctrl |=
+				INFINIPATH_IBCC_LINKINITCMD_POLL <<
+				INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+		}
+		what = "ARMED";
+	} else if (which == INFINIPATH_IBCC_LINKCMD_ACTIVE) {
+		dd->ipath_flags |= IPATH_LINK_TOACTIVE;
+		dd->ipath_flags &= ~(IPATH_LINK_TOARMED | IPATH_LINK_SLEEPING);
+		what = "ACTIVE";
+	} else if (which & (INFINIPATH_IBCC_LINKINITCMD_MASK <<
+			    INFINIPATH_IBCC_LINKINITCMD_SHIFT)) {
+		/* down, disable, etc. */
+		dd->ipath_flags &= ~(IPATH_LINK_TOARMED | IPATH_LINK_TOACTIVE);
+		if (((which & INFINIPATH_IBCC_LINKINITCMD_MASK) >>
+		     INFINIPATH_IBCC_LINKINITCMD_SHIFT) ==
+		    INFINIPATH_IBCC_LINKINITCMD_SLEEP) {
+			dd->ipath_flags |= IPATH_LINK_SLEEPING | IPATH_LINKDOWN;
+		} else
+			dd->ipath_flags |= IPATH_LINKDOWN;
+		dd->ipath_ibcctrl |=
+			which & (INFINIPATH_IBCC_LINKINITCMD_MASK <<
+				 INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+		what = "DOWN";
+	} else {
+		what = "UNKNOWN";
+		_IPATH_INFO("Unknown link transition requested (which=0x%x)\n",
+			    which);
+	}
+
+	dd->ipath_ibcctrl |= ((uint64_t) which & INFINIPATH_IBCC_LINKCMD_MASK)
+		<< INFINIPATH_IBCC_LINKCMD_SHIFT;
+
+	_IPATH_SMADBG("Trying to move unit %u to %s, current ltstate is %s\n",
+		      t, what,
+		      ipath_ibcstatus_str[(ipath_kget_kreg64(t, kr_ibcstatus)
+			      >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)
+			      & INFINIPATH_IBCS_LINKTRAININGSTATE_MASK]);
+	ipath_kput_kreg(t, kr_ibcctrl, dd->ipath_ibcctrl);
+}
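/*
 * The linkcmd/linkinitcmd handling above is a mask-and-shift
 * read-modify-write on a shadow copy of the IBC control register: clear
 * the field, then OR in the new value.  A generic sketch of that idiom;
 * the mask, shift, and value in the usage comment are hypothetical, not
 * the chip's actual field layout.
 */
#include <stdint.h>

/* clear the field selected by (mask << shift), then insert 'val' into it */
static inline uint64_t field_set(uint64_t reg, uint64_t mask,
				 unsigned int shift, uint64_t val)
{
	reg &= ~(mask << shift);
	reg |= (val & mask) << shift;
	return reg;
}

/* e.g. set a hypothetical 2-bit LINKCMD field at bit 16 to 2 (ARMED): */
/*	ibcctrl = field_set(ibcctrl, 0x3, 16, 2);                       */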
+
+static int ipath_bringup_link(const ipath_type t)
+{
+	ipath_devdata *dd = &devdata[t];
+	uint64_t val, ibc;
+	int ret = 0;
+
+	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;	/* hold IBC in reset */
+	ipath_kput_kreg(t, kr_control, dd->ipath_control);
+
+	/*
+	 * Note that prior to try 14 or 15 of IB, the credit scaling
+	 * wasn't working, because it was swapped for writes with the
+	 * 1 bit default linkstate field
+	 */
+
+	/* ignore pbc and align word */
+	val = dd->ipath_piosize - 2 * sizeof(uint32_t);
+	/*
+	 * for ICRC, which we only send in diag test pkt mode, and we don't
+	 * need to worry about that for mtu
+	 */
+	val += 1;
+	/*
+	 * set the IBC maxpktlength to the size of our pio buffers;
+	 * the maxpktlength is in words.  This is *not* the IB data MTU
+	 */
+	ibc = (val / sizeof(uint32_t)) << INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+	/* in KB */
+	ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
+	/*
+	 * how often flowctrl is sent, more or less in usecs; balance
+	 * against the watermark value, so that in theory senders always
+	 * get a flow control update in time to not let the IB link go idle.
+	 */
+	ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
+	/* max error tolerance */
+	ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+	/* use "real" buffer space for IB credit flow control */
+	ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
+	ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+	/* initially come up waiting for TS1, without sending anything. */
+	dd->ipath_ibcctrl = ibc;
+	/* don't put linkinitcmd in ipath_ibcctrl, want that to stay a NOP */
+	ibc |= INFINIPATH_IBCC_LINKINITCMD_SLEEP <<
+		INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+	dd->ipath_flags |= IPATH_LINK_SLEEPING;
+	ipath_kput_kreg(t, kr_ibcctrl, ibc);
+
+	ret = ipath_bringup_serdes(t);
+
+	if (ret)
+		_IPATH_INFO("Could not initialize SerDes, not usable\n");
+	else {
+		dd->ipath_control |= INFINIPATH_C_LINKENABLE;	/* enable IBC */
+		ipath_kput_kreg(t, kr_control, dd->ipath_control);
+	}
+
+	return ret;
+}
+
+/*
+ * called from ipath_shutdown_link(), and from sma doing a LINKDOWN.
+ * Left as a separate function for historical reasons, and we may want
+ * it to do more than just call ipath_set_ib_lstate() again sometime
+ * in the future.
+ */
+void ipath_down_link(const ipath_type t)
+{
+	ipath_set_ib_lstate(t, INFINIPATH_IBCC_LINKINITCMD_SLEEP <<
+			    INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+}
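/*
 * A worked example of the MaxPktLen arithmetic in ipath_bringup_link():
 * drop the two leading 32-bit words (pbc and align word), add one byte of
 * headroom for ICRC, then convert bytes to 32-bit words.  The 2048-byte
 * PIO buffer size below is a hypothetical value, just to make the
 * arithmetic concrete.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t piosize = 2048;	/* hypothetical PIO buffer size, bytes */
	uint64_t val = piosize - 2 * sizeof(uint32_t);	/* strip pbc + align */

	val += 1;	/* room for ICRC (diag test packet mode only) */
	/* prints 510 for a 2048-byte buffer: (2048 - 8 + 1) / 4 */
	printf("IBC MaxPktLen = %llu words\n",
	       (unsigned long long)(val / sizeof(uint32_t)));
	return 0;
}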
+
+/*
+ * do this when driver is being unloaded, or perhaps for diags, and
+ * maybe when we get an interrupt of a fatal link error that requires
+ * bringing the link down and back up
+ */
+static int ipath_shutdown_link(const ipath_type t)
+{
+	uint64_t val;
+	ipath_devdata *dd = &devdata[t];
+	int ret = 0;
+
+	_IPATH_DBG("Shutting down the link\n");
+	ipath_down_link(t);
+
+	/*
+	 * we are shutting down, so tell the layered driver.  We don't
+	 * do this on just a link state change, much like ethernet:
+	 * a cable unplug, etc. doesn't change driver state
+	 */
+	if (dd->ipath_layer.l_intr)
+		dd->ipath_layer.l_intr(t, IPATH_LAYER_INT_IF_DOWN);
+
+	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;	/* disable IBC */
+	ipath_kput_kreg(t, kr_control, dd->ipath_control);
+
+	*dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF | IPATH_STATUS_IB_READY);
+
+	/*
+	 * clear SerdesEnable and turn the leds off; do this here because
+	 * we are unloading, so don't count on interrupts to move along
+	 */
+	ipath_quiet_serdes(t);
+	val = dd->ipath_extctrl &
+		~(INFINIPATH_EXTC_LEDPRIPORTGREENON |
+		  INFINIPATH_EXTC_LEDPRIPORTYELLOWON);
+	dd->ipath_extctrl = val;
+	ipath_kput_kreg(t, kr_extctrl, val);
+
+	if (dd->ipath_stats_timer_active) {
+		del_timer_sync(&dd->ipath_stats_timer);
+		dd->ipath_stats_timer_active = 0;
+	}
+	if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
+		/* can't do anything more with chip; needs re-init */
+		*dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
+		if (dd->ipath_kregbase) {
+			/*
+			 * if we haven't already cleaned up before, these
+			 * ensure that any register reads/writes "fail"
+			 * until re-init
+			 */
+			dd->ipath_kregbase = NULL;
+			dd->ipath_kregvirt = NULL;
+			dd->ipath_uregbase = 0ULL;
+			dd->ipath_sregbase = 0ULL;
+			dd->ipath_cregbase = 0ULL;
+			dd->ipath_kregsize = 0;
+		}
+#ifdef CONFIG_MTRR
+		if (dd->ipath_mtrr) {
+			_IPATH_VDBG("undoing WCCOMB on pio buffers\n");
+			mtrr_del(dd->ipath_mtrr, 0, 0);
+			dd->ipath_mtrr = 0;
+		}
+#endif
+	}
+
+	return ret;
+}
+
+/*
+ * when closing, free up any allocated data for a port, if the
+ * reference count goes to zero.
+ * Note: this also frees the portdata itself!
+ */
+void ipath_free_pddata(ipath_devdata *dd, uint32_t port, int freehdrq)
+{
+	ipath_portdata *pd = dd->ipath_pd[port];
+
+	if (!pd)
+		return;
+	if (freehdrq)
+		/*
+		 * only clear and free portdata if we are going to also
+		 * release the hdrq, otherwise we leak the hdrq on each
+		 * open/close cycle
+		 */
+		dd->ipath_pd[port] = NULL;
+	/* cleanup locked pages private data structures */
+	ipath_mlock_cleanup(pd);
+	if (freehdrq && pd->port_rcvhdrq) {
+		int i, n = 1 << pd->port_rcvhdrq_order;
+		_IPATH_VDBG("free closed port %d rcvhdrq @ %p (order=%u)\n",
+			    pd->port_port, pd->port_rcvhdrq,
+			    pd->port_rcvhdrq_order);
+		for (i = 0; i < n; i++)
+			ClearPageReserved(virt_to_page(pd->port_rcvhdrq +
+						       (i * PAGE_SIZE)));
+		free_pages((unsigned long)pd->port_rcvhdrq,
+			   pd->port_rcvhdrq_order);
+		pd->port_rcvhdrq = NULL;
+	}
+	if (port && pd->port_rcvegrbuf_pages) {	/* always free this, however */
+		void *virt;
+		unsigned e, i, n = 1 << pd->port_rcvegrbuf_order;
+		if (pd->port_rcvegrbuf_virt) {
+			for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+				virt = pd->port_rcvegrbuf_virt[e];
+				for (i = 0; i < n; i++)
+					ClearPageReserved(virt_to_page(virt +
+							(i * PAGE_SIZE)));
+				_IPATH_VDBG("egrbuf free_pages(%p, %x), chunk %u/%u\n",
+					    virt, pd->port_rcvegrbuf_order, e,
+					    pd->port_rcvegrbuf_chunks);
+				free_pages((unsigned long)virt,
+					   pd->port_rcvegrbuf_order);
+			}
+			vfree(pd->port_rcvegrbuf_virt);
+			pd->port_rcvegrbuf_virt = NULL;
+		}
+		pd->port_rcvegrbuf_chunks = 0;
+		_IPATH_VDBG("free closed port %d rcvegrbufs ptr array\n",
+			    pd->port_port);
+		/* now the pointer array. */
+		vfree(pd->port_rcvegrbuf_pages);
+		pd->port_rcvegrbuf_pages = NULL;
+	} else if (port == 0 && dd->ipath_port0_skbs) {
+		unsigned e;
+		struct sk_buff **skbs = dd->ipath_port0_skbs;
+
+		dd->ipath_port0_skbs = NULL;
+		_IPATH_VDBG("free closed port %d ipath_port0_skbs @ %p\n",
+			    pd->port_port, skbs);
+		for (e = 0; e < dd->ipath_rcvegrcnt; e++)
+			if (skbs[e])
+				dev_kfree_skb(skbs[e]);
+		vfree(skbs);
+	}
+	if (freehdrq) {
+		kfree(pd->port_tid_pg_list);
+		kfree(pd);
+	}
+}
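/*
 * The egr-buffer teardown above walks a table of chunk pointers, frees
 * each chunk, then frees the table itself and zeroes the bookkeeping.  A
 * minimal userspace analog of that ordering, using malloc/free; the
 * chunk_table type and its field names are hypothetical.
 */
#include <stdlib.h>

struct chunk_table {
	void **chunks;		/* one allocation per chunk */
	unsigned int nchunks;
};

void chunk_table_free(struct chunk_table *t)
{
	unsigned int e;

	if (!t->chunks)
		return;
	for (e = 0; e < t->nchunks; e++)
		free(t->chunks[e]);	/* free(NULL) is a harmless no-op */
	free(t->chunks);	/* now the pointer array itself */
	t->chunks = NULL;	/* so a double call is safe */
	t->nchunks = 0;
}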
+
+int __init infinipath_init(void)
+{
+	int r = 0, i;
+
+	_IPATH_DBG(KERN_INFO DRIVER_LOAD_MSG "%s", ipath_core_version);
+
+	ipath_init_picotime();	/* init cycles -> pico conversion */
+
+	if (!ipath_ctl_header) {	/* should always be true */
+		if (!(ipath_ctl_header = register_sysctl_table(ipath_ctl, 1)))
+			_IPATH_INFO("Couldn't register sysctl interface\n");
+	}
+
+	/*
+	 * initialize the statusp to temporary storage so we can use it
+	 * everywhere without first checking.  When we "really" assign it,
+	 * we copy from _ipath_status
+	 */
+	for (i = 0; i < infinipath_max; i++)
+		devdata[i].ipath_statusp = &devdata[i]._ipath_status;
+
+	/*
+	 * init these early, in case we take an interrupt as soon as the
+	 * irq is set up.  Saw a spinlock panic once that appeared to be
+	 * due to that problem, when they were initted later on.
+	 */
+	spin_lock_init(&ipath_pioavail_lock);
+	spin_lock_init(&ipath_sma_lock);
+
+	pci_register_driver(&infinipath_driver);
+
+	driver_create_file(&(infinipath_driver.driver), &driver_attr_version);
+
+	if ((r = register_chrdev(ipath_major, MODNAME, &ipath_fops)))
+		_IPATH_ERROR("Unable to register %s device\n", MODNAME);
+
+	/*
+	 * never return an error, since we could have stuff registered,
+	 * resources used, etc., even if no hardware found.  This way we
+	 * can clean up through unload.
+	 */
+	return 0;
+}
+
+/*
+ * note: if for some reason the unload fails after this routine, and leaves
+ * the driver enterable by user code, we'll almost certainly crash and burn...
+ */
+static void __exit infinipath_cleanup(void)
+{
+	int r, m, port;
+
+	driver_remove_file(&(infinipath_driver.driver), &driver_attr_version);
+	if (ipath_ctl_header) {
+		unregister_sysctl_table(ipath_ctl_header);
+		ipath_ctl_header = NULL;
+	} else
+		_IPATH_DBG("No sysctl unregister, not registered OK\n");
+	if ((r = unregister_chrdev(ipath_major, MODNAME)))
+		_IPATH_DBG("unregister of device failed: %d\n", r);
+
+	/*
+	 * turn off rcv, send, and interrupts for all ports, all drivers
+	 * (should we also hard reset the chip here?);
+	 * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
+	 * for all versions of the driver, if they were allocated
+	 */
+	for (m = 0; m < infinipath_max; m++) {
+		uint64_t val;
+		ipath_devdata *dd = &devdata[m];
+		if (dd->ipath_kregbase) {
+			/* in case unload fails, be consistent */
+			dd->ipath_rcvctrl = 0U;
+			ipath_kput_kreg(m, kr_rcvctrl, dd->ipath_rcvctrl);
+
+			/*
+			 * gracefully stop all sends, allowing any in
+			 * progress to trickle out first.
+			 */
+			ipath_kput_kreg(m, kr_sendctrl, 0ULL);
+			val = ipath_kget_kreg64(m, kr_scratch);	/* flush it */
+			/*
+			 * enough for anything that's going to trickle
+			 * out to have actually done so.
+			 */
+			udelay(5);
+
+			/*
+			 * abort any armed or launched PIO buffers that
+			 * didn't go (self-clearing).  Will cause any
+			 * packet currently being transmitted to go out
+			 * with an EBP, and may also cause a short packet
+			 * error on the receiver.
+			 */
+			ipath_kput_kreg(m, kr_sendctrl, INFINIPATH_S_ABORT);
+
+			/* mask interrupts, but not errors */
+			ipath_kput_kreg(m, kr_intmask, 0ULL);
+			ipath_shutdown_link(m);
+
+			/*
+			 * clear all interrupts and errors.  Next time the
+			 * driver is loaded, we know that whatever is set
+			 * happened while we were unloaded
+			 */
+			ipath_kput_kreg(m, kr_hwerrclear, ~0ULL);
+			ipath_kput_kreg(m, kr_errorclear, ~0ULL);
+			ipath_kput_kreg(m, kr_intclear, ~0ULL);
+			if (dd->__ipath_pioavailregs_base) {
+				kfree((void *)dd->__ipath_pioavailregs_base);
+				dd->__ipath_pioavailregs_base =
+					dd->ipath_pioavailregs_dma = 0;
+			}
+
+			if (dd->ipath_pageshadow) {
+				struct page **tmpp = dd->ipath_pageshadow;
+				int i, cnt = 0;
+
+				_IPATH_VDBG("Unlocking any expTID pages still locked\n");
+				for (port = 0; port < dd->ipath_cfgports;
+				     port++) {
+					int port_tidbase =
+						port * dd->ipath_rcvtidcnt;
+					int maxtid =
+						port_tidbase + dd->ipath_rcvtidcnt;
+					for (i = port_tidbase; i < maxtid; i++) {
+						if (tmpp[i]) {
+							ipath_munlock(1, &tmpp[i]);
+							tmpp[i] = 0;
+							cnt++;
+						}
+					}
+				}
+				if (cnt) {
+					ipath_stats.sps_pageunlocks += cnt;
+					_IPATH_VDBG("There were still %u expTID entries locked\n",
+						    cnt);
+				}
+				if (ipath_stats.sps_pagelocks ||
+				    ipath_stats.sps_pageunlocks)
+					_IPATH_VDBG("%llu pages locked, %llu unlocked via ipath_m{un}lock\n",
+						    ipath_stats.sps_pagelocks,
+						    ipath_stats.sps_pageunlocks);
+
+				_IPATH_VDBG("Free shadow page tid array at %p\n",
+					    dd->ipath_pageshadow);
+				vfree(dd->ipath_pageshadow);
+				dd->ipath_pageshadow = NULL;
+			}
+
+			/*
+			 * free any resources still in use (usually just
+			 * kernel ports) at unload
+			 */
+			for (port = 0; port < dd->ipath_cfgports; port++)
+				ipath_free_pddata(dd, port, 1);
+			kfree(dd->ipath_pd);
+			/*
+			 * debuggability, in case some cleanup path
+			 * tries to use it after this
+			 */
+			dd->ipath_pd = NULL;
+		}
+
+		if (dd->pcidev) {
+			if (dd->pcidev->irq) {
+				_IPATH_VDBG("unit %u free_irq of irq %x\n", m,
+					    dd->pcidev->irq);
+				free_irq(dd->pcidev->irq, dd);
+			} else
+				_IPATH_DBG("irq is 0, not doing free_irq for unit %u\n",
+					   m);
+			dd->pcidev = NULL;
+		}
+		if (dd->pci_registered) {
+			_IPATH_VDBG("Unregistering pci infrastructure unit %u\n",
+				    m);
+			pci_unregister_driver(&infinipath_driver);
+			dd->pci_registered = 0;
+		} else
+			_IPATH_VDBG("unit %u: no pci unreg, wasn't registered\n",
+				    m);
+		/* clean up any per-chip chip-specific stuff */
+		ipath_chip_cleanup(dd);
+	}
+	/*
+	 * clean up any chip-specific stuff; for now, only one type of chip
+	 * for any given driver
+	 */
+	ipath_chip_done();
+
+	/* cleanup all our locked pages private data structures */
+	ipath_mlock_cleanup(NULL);
+}
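/*
 * The expTID pass in infinipath_cleanup() scans the per-port shadow
 * array for pages a user never freed and releases them, counting what it
 * reclaims.  A standalone sketch of that sweep; release_page() and the
 * array geometry here are hypothetical stand-ins for ipath_munlock() and
 * the driver's per-port TID layout.
 */
#include <stddef.h>

extern void release_page(void *page);	/* stand-in for ipath_munlock() */

/* returns the number of leaked entries reclaimed */
unsigned int sweep_shadow(void **shadow, unsigned int nports,
			  unsigned int tids_per_port)
{
	unsigned int port, i, cnt = 0;

	for (port = 0; port < nports; port++) {
		unsigned int base = port * tids_per_port;

		for (i = base; i < base + tids_per_port; i++) {
			if (shadow[i]) {
				release_page(shadow[i]);
				shadow[i] = NULL;
				cnt++;
			}
		}
	}
	return cnt;
}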
+
+/*
+ * This is a generic function, so it can return device-specific info;
+ * it keeps us in sync with the version that supports multiple chip
+ * types.
+ */
+void ipath_get_boardname(const ipath_type t, char *name, size_t namelen)
+{
+	ipath_ht_get_boardname(t, name, namelen);
+}
+
+module_init(infinipath_init);
+module_exit(infinipath_cleanup);
+
+EXPORT_SYMBOL(infinipath_debug);
+EXPORT_SYMBOL(ipath_get_boardname);
-- 
0.99.9n