Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1750921AbVLPXvK (ORCPT );
	Fri, 16 Dec 2005 18:51:10 -0500
Received: (majordomo@vger.kernel.org) by vger.kernel.org
	id S964821AbVLPXuM (ORCPT );
	Fri, 16 Dec 2005 18:50:12 -0500
Received: from sj-iport-5.cisco.com ([171.68.10.87]:10862 "EHLO
	sj-iport-5.cisco.com") by vger.kernel.org with ESMTP
	id S964818AbVLPXtP (ORCPT );
	Fri, 16 Dec 2005 18:49:15 -0500
X-IronPort-AV: i="3.99,263,1131350400"; d="scan'208"; a="242176712:sNHT1955315644"
Subject: [PATCH 06/13] [RFC] ipath LLD core, part 3
In-Reply-To: <200512161548.YvnmQHKTsmmCBp1k@cisco.com>
X-Mailer: Roland's Patchbomber
Date: Fri, 16 Dec 2005 15:48:55 -0800
Message-Id: <200512161548.KglSM2YESlGlEQfQ@cisco.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
To: linux-kernel@vger.kernel.org, openib-general@openib.org
Content-Transfer-Encoding: 7BIT
From: Roland Dreier
X-OriginalArrivalTime: 16 Dec 2005 23:48:57.0042 (UTC) FILETIME=[478FA320:01C6029B]
Sender: linux-kernel-owner@vger.kernel.org
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Length: 78986
Lines: 2403

Last part of core driver

---

 drivers/infiniband/hw/ipath/ipath_driver.c | 2380 ++++++++++++++++++++++++++++
 1 files changed, 2380 insertions(+), 0 deletions(-)

f7ffc0cabd62be5e13ad84027d5712e6f92d9cc1
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
index 0dee4ce..87b6dae 100644
--- a/drivers/infiniband/hw/ipath/ipath_driver.c
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c
@@ -4877,3 +4877,2383 @@ static int ipath_wait_intr(ipath_portdat
 	}
 	return 0;
 }
+
+/*
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller.  To make it easier to
+ * catch bugs, and to reduce search time, we keep a cursor for
+ * each port, walking the shadow tid array to find one that's not
+ * in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ */
+static int ipath_tid_update(ipath_portdata * pd, struct _tidupd *tidu)
+{
+	int ret = 0, ntids;
+	uint32_t tid, porttid, cnt, i, tidcnt;
+	struct _tidupd tu;
+	uint16_t *tidlist;
+	ipath_devdata *dd = &devdata[pd->port_unit];
+	uint64_t vaddr, physaddr, lenvalid;
+	volatile uint64_t *tidbase;
+	uint64_t tidmap[8];
+	struct page **pagep = NULL;
+
+	tu.tidcnt = 0;	/* for early errors */
+	if (!dd->ipath_pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+	if (copy_from_user(&tu, tidu, sizeof tu)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	if (!(cnt = tu.tidcnt)) {
+		_IPATH_DBG("After copyin, tidcnt 0, tidlist %llx\n",
+			   tu.tidlist);
+		/* or should we treat as success?  likely a bug */
+		ret = -EFAULT;
+		goto done;
+	}
+	tidcnt = dd->ipath_rcvtidcnt;
+	if (cnt >= tidcnt) {
+		/* make sure it all fits in port_tid_pg_list */
+		_IPATH_INFO
+		    ("Process tried to allocate %u TIDs, only trying max (%u)\n",
+		     cnt, tidcnt);
+		cnt = tidcnt;
+	}
+	pagep = (struct page **)pd->port_tid_pg_list;
+	tidlist = (uint16_t *) (&pagep[cnt]);
+
+	memset(tidmap, 0, sizeof(tidmap));
+	tid = pd->port_tidcursor;
+	/* before decrement; chip actual # */
+	porttid = pd->port_port * tidcnt;
+	ntids = tidcnt;
+	tidbase = (volatile uint64_t *)((volatile char *)
+					(devdata[pd->port_unit].ipath_kregbase) +
+					devdata[pd->port_unit].ipath_rcvtidbase +
+					porttid * sizeof(*tidbase));
+
+	_IPATH_VDBG("Port%u %u tids, cursor %u, tidbase %p\n", pd->port_port,
+		    cnt, tid, tidbase);
+
+	vaddr = tu.tidvaddr;	/* virtual address of first page in transfer */
+	if (!access_ok(VERIFY_WRITE, (void *)vaddr, cnt * PAGE_SIZE)) {
+		_IPATH_DBG("Fail vaddr %llx, %u pages, !access_ok\n",
+			   vaddr, cnt);
+		ret = -EFAULT;
+		goto done;
+	}
+	if ((ret = ipath_mlock((unsigned long)vaddr, cnt, pagep))) {
+		if (ret == -EBUSY) {
+			_IPATH_DBG
+			    ("Failed to lock addr %p, %u pages (already locked)\n",
+			     (void *)vaddr, cnt);
+			/*
+			 * for now, continue, and see what happens
+			 * but with the new implementation, this should
+			 * never happen, unless perhaps the user has
+			 * mpin'ed the pages themselves (something we
+			 * need to test)
+			 */
+			ret = 0;
+		} else {
+			_IPATH_INFO
+			    ("Failed to lock addr %p, %u pages: errno %d\n",
+			     (void *)vaddr, cnt, -ret);
+			goto done;
+		}
+	}
+	for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+		for (; ntids--; tid++) {
+			if (tid == tidcnt)
+				tid = 0;
+			if (!dd->ipath_pageshadow[porttid + tid])
+				break;
+		}
+		if (ntids < 0) {
+			/*
+			 * oops, wrapped all the way through their TIDs,
+			 * and didn't have enough free; see comments at
+			 * start of routine
+			 */
+			_IPATH_DBG
+			    ("Not enough free TIDs for %u pages (index %d), failing\n",
+			     cnt, i);
+			i--;	/* last tidlist[i] not filled in */
+			ret = -ENOMEM;
+			break;
+		}
+		tidlist[i] = tid;
+		_IPATH_VDBG("Updating idx %u to TID %u, vaddr %llx\n",
+			    i, tid, vaddr);
+		/* for now we "know" system pages and TID pages are same size */
+		/* for ipath_free_tid */
+		dd->ipath_pageshadow[porttid + tid] = pagep[i];
+		__set_bit(tid, tidmap);	/* don't need atomic or it's overhead */
+		physaddr = page_to_phys(pagep[i]);
+		ipath_stats.sps_pagelocks++;
+		_IPATH_VDBG("TID %u, vaddr %llx, physaddr %llx pgp %p\n",
+			    tid, vaddr, physaddr, pagep[i]);
+		/*
+		 * in words (fixed, full page).  could make less for very last
+		 * page in transfer, but for now we won't worry about it.
+		 */
+		lenvalid = PAGE_SIZE >> 2;
+		lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+		physaddr |= lenvalid | INFINIPATH_RT_VALID;
+		ipath_kput_memq(pd->port_unit, &tidbase[tid], physaddr);
+		/*
+		 * don't check this tid in ipath_portshadow, since we
+		 * just filled it in; start with the next one.
+		 */
+		tid++;
+	}
+
+	if (ret) {
+		uint32_t limit;
+		uint64_t tidval;
+		/*
+		 * chip errata bug 7358, try to work around it by
+		 * marking invalid tids as having max length
+		 */
+		tidval =
+		    (~0ULL & INFINIPATH_RT_BUFSIZE_MASK) <<
+		    INFINIPATH_RT_BUFSIZE_SHIFT;
+	      cleanup:
+		/* jump here if copy out of updated info failed... */
+		_IPATH_DBG("After failure (ret=%d), undo %d of %d entries\n",
+			   -ret, i, cnt);
+		/* same code that's in ipath_free_tid() */
+		if ((limit = sizeof(tidmap) * _BITS_PER_BYTE) > tidcnt)
+			/* just in case size changes in future */
+			limit = tidcnt;
+		tid = find_first_bit((const unsigned long *)tidmap, limit);
+		/*
+		 * chip errata bug 7358, try to work around it by
+		 * marking invalid tids as having max length
+		 */
+		tidval =
+		    (~0ULL & INFINIPATH_RT_BUFSIZE_MASK) <<
+		    INFINIPATH_RT_BUFSIZE_SHIFT;
+		for (; tid < limit; tid++) {
+			if (!test_bit(tid, tidmap))
+				continue;
+			if (dd->ipath_pageshadow[porttid + tid]) {
+				_IPATH_VDBG("Freeing TID %u\n", tid);
+				ipath_kput_memq(pd->port_unit, &tidbase[tid],
+						tidval);
+				dd->ipath_pageshadow[porttid + tid] = NULL;
+				ipath_stats.sps_pageunlocks++;
+			}
+		}
+		(void)ipath_munlock(cnt, pagep);
+	} else {
+		/*
+		 * copy the updated array, with ipath_tid's filled in,
+		 * back to user.  Since we did the copy in already, this
+		 * "should never fail"
+		 * If it does, we have to clean up...
+		 */
+		int r;
+		if ((r =
+		     copy_to_user((void *)tu.tidlist, tidlist,
+				  cnt * sizeof(*tidlist)))) {
+			_IPATH_DBG
+			    ("Failed to copy out %d TIDs (%lx bytes) to %llx (ret %x)\n",
+			     cnt, cnt * sizeof(*tidlist), tu.tidlist, r);
+			ret = -EFAULT;
+			goto cleanup;
+		}
+		if (copy_to_user((void *)tu.tidmap, tidmap, sizeof tidmap)) {
+			_IPATH_DBG("Failed to copy out TID map to %llx\n",
+				   tu.tidmap);
+			ret = -EFAULT;
+			goto cleanup;
+		}
+		if (tid == tidcnt)
+			tid = 0;
+		pd->port_tidcursor = tid;
+	}
+
+done:
+	if (ret)
+		_IPATH_DBG
+		    ("Failed to map %u TID pages, failing with %d, tidu %p\n",
+		     tu.tidcnt, -ret, tidu);
+	return ret;
+}
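+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * The cursor-based scan in ipath_tid_update() above reduces to the
+ * following stand-alone sketch (hypothetical helper; the names are
+ * invented for illustration):
+ */
+#if 0
+static int example_find_free_tid(struct page **shadow, uint32_t cursor,
+				 uint32_t tidcnt)
+{
+	uint32_t tries;
+
+	for (tries = 0; tries < tidcnt; tries++, cursor++) {
+		if (cursor == tidcnt)
+			cursor = 0;	/* wrap, like the driver's scan */
+		if (!shadow[cursor])
+			return cursor;	/* free slot found */
+	}
+	return -1;	/* all TIDs in use */
+}
+#endif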
+
+/*
+ * right now we are unlocking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.  We check that the TID is in range for this port
+ * but otherwise don't check validity; if user has an error and
+ * frees the wrong tid, it's only their own data that can thereby
+ * be corrupted.  We do check that the TID was in use, for sanity.
+ * We always use our idea of the saved address, not the address
+ * that they pass in to us.
+ */
+
+static int ipath_tid_free(ipath_portdata * pd, struct _tidupd *tidu)
+{
+	int ret = 0;
+	uint32_t tid, porttid, cnt, limit, tidcnt;
+	struct _tidupd tu;
+	ipath_devdata *dd = &devdata[pd->port_unit];
+	uint64_t *tidbase;
+	uint64_t tidmap[8];
+	uint64_t tidval;
+
+	tu.tidcnt = 0;	/* for early errors */
+	if (!dd->ipath_pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	if (copy_from_user(&tu, tidu, sizeof tu)) {
+		_IPATH_DBG("copy of tidupd structure failed\n");
+		ret = -EFAULT;
+		goto done;
+	}
+	if (copy_from_user(tidmap, (void *)tu.tidmap, sizeof tidmap)) {
+		_IPATH_DBG("copy of tidmap failed\n");
+		ret = -EFAULT;
+		goto done;
+	}
+
+	porttid = pd->port_port * dd->ipath_rcvtidcnt;
+	tidbase =
+	    (uint64_t *) ((char *)(devdata[pd->port_unit].ipath_kregbase) +
+			  devdata[pd->port_unit].ipath_rcvtidbase +
+			  porttid * sizeof(*tidbase));
+
+	tidcnt = dd->ipath_rcvtidcnt;
+	if ((limit = sizeof(tidmap) * _BITS_PER_BYTE) > tidcnt)
+		limit = tidcnt;	/* just in case size changes in future */
+	tid = find_first_bit((const unsigned long *)tidmap, limit);
+	_IPATH_VDBG
+	    ("Port%u free %u tids; first bit (max=%d) set is %d, porttid %u\n",
+	     pd->port_port, tu.tidcnt, limit, tid, porttid);
+	/*
+	 * chip errata bug 7358, try to work around it by marking invalid
+	 * tids as having max length
+	 */
+	tidval =
+	    (~0ULL & INFINIPATH_RT_BUFSIZE_MASK) << INFINIPATH_RT_BUFSIZE_SHIFT;
+	for (cnt = 0; tid < limit; tid++) {
+		/*
+		 * small optimization; if we detect a run of 3 or so without
+		 * any set, use find_first_bit again.  That's mainly to
+		 * accelerate the case where we wrapped, so we have some at
+		 * the beginning, and some at the end, and a big gap
+		 * in the middle.
+		 */
+		if (!test_bit(tid, tidmap))
+			continue;
+		cnt++;
+		if (dd->ipath_pageshadow[porttid + tid]) {
+			_IPATH_VDBG("Freeing TID %u\n", tid);
+			ipath_kput_memq(pd->port_unit, &tidbase[tid], tidval);
+			ipath_munlock(1, &dd->ipath_pageshadow[porttid + tid]);
+			dd->ipath_pageshadow[porttid + tid] = NULL;
+			ipath_stats.sps_pageunlocks++;
+		} else
+			_IPATH_DBG("Unused tid %u, ignoring\n", tid);
+	}
+	if (cnt != tu.tidcnt)
+		_IPATH_DBG("passed in tidcnt %d, only %d bits set in map\n",
+			   tu.tidcnt, cnt);
+done:
+	if (ret)
+		_IPATH_DBG("Failed to unmap %u TID pages, failing with %d\n",
+			   tu.tidcnt, -ret);
+	return ret;
+}
+
+/* called from user init code, and also layered driver init */
+int ipath_setrcvhdrsize(const ipath_type mdev, unsigned rhdrsize)
+{
+	int ret = 0;
+
+	if (devdata[mdev].ipath_flags & IPATH_RCVHDRSZ_SET) {
+		if (devdata[mdev].ipath_rcvhdrsize != rhdrsize) {
+			_IPATH_INFO
+			    ("Error: can't set protocol header size %u, already %u\n",
+			     rhdrsize, devdata[mdev].ipath_rcvhdrsize);
+			ret = -EAGAIN;
+		} else
+			/* OK if set already, with same value, nothing to do */
+			_IPATH_VDBG("Reuse same protocol header size %u\n",
+				    devdata[mdev].ipath_rcvhdrsize);
+	} else if (rhdrsize >
+		   (devdata[mdev].ipath_rcvhdrentsize -
+		    (sizeof(uint64_t) / sizeof(uint32_t)))) {
+		_IPATH_DBG
+		    ("Error: can't set protocol header size %u (> max %u)\n",
+		     rhdrsize,
+		     devdata[mdev].ipath_rcvhdrentsize -
+		     (uint32_t) (sizeof(uint64_t) / sizeof(uint32_t)));
+		ret = -EOVERFLOW;
+	} else {
+		devdata[mdev].ipath_flags |= IPATH_RCVHDRSZ_SET;
+		devdata[mdev].ipath_rcvhdrsize = rhdrsize;
+		ipath_kput_kreg(mdev, kr_rcvhdrsize,
+				devdata[mdev].ipath_rcvhdrsize);
+		_IPATH_VDBG("Set protocol header size to %u\n",
+			    devdata[mdev].ipath_rcvhdrsize);
+	}
+	return ret;
+}
+
+/*
+ * find an available pio buffer, and do appropriate marking as busy, etc.
+ * returns buffer number if one found (>=0), negative number is error.
+ * Used by ipath_send_smapkt and ipath_layer_send
+ */
+int ipath_getpiobuf(int mdev)
+{
+	int i, j, starti, updated = 0;
+	unsigned piobcnt, iter;
+	unsigned long flags;
+	ipath_devdata *dd = &devdata[mdev];
+	uint64_t *shadow = dd->ipath_pioavailshadow;
+
+	piobcnt = (unsigned)dd->ipath_piobcnt;
+	starti = dd->ipath_lastport_piobuf;
+	iter = piobcnt - starti;
+	if (dd->ipath_upd_pio_shadow) {
+		/*
+		 * minor optimization.  If we had no buffers on last call,
+		 * start out by doing the update; continue and do scan
+		 * even if no buffers were updated, to be paranoid
+		 */
+		ipath_update_pio_bufs(mdev);
+		/* we scanned here, don't do it at end of scan */
+		updated = 1;
+		i = starti;
+	} else
+		i = dd->ipath_lastpioindex;
+
+rescan:
+	/*
+	 * while test_and_set_bit() is atomic,
+	 * we do that and then the change_bit(), and the pair is not.
+	 * See if this is the cause of the remaining armlaunch errors.
+	 */
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	for (j = 0; j < iter; j++, i++) {
+		if (i >= piobcnt)
+			i = starti;
+		/*
+		 * To avoid bus lock overhead, we first find a candidate
+		 * buffer, then do the test and set, and continue if
+		 * that fails.
+		 */
+		if (test_bit((2 * i) + 1, shadow) ||
+		    test_and_set_bit((2 * i) + 1, shadow)) {
+			continue;
+		}
+		/* flip generation bit */
+		change_bit(2 * i, shadow);
+		break;
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+	if (j == iter) {
+		/*
+		 * first time through; shadow exhausted, but may be
+		 * real buffers available, so go see; if any updated,
+		 * rescan (once)
+		 */
+		if (!updated) {
+			ipath_update_pio_bufs(mdev);
+			updated = 1;
+			i = starti;
+			goto rescan;
+		}
+		dd->ipath_upd_pio_shadow = 1;
+		/* not atomic, but if we lose one once in a while, that's OK */
+		ipath_stats.sps_nopiobufs++;
+		if (!(++dd->ipath_consec_nopiobuf % 100000)) {
+			_IPATH_DBG
+			    ("%u pio sends with no bufavail; dmacopy: %llx %llx %llx %llx; shadow: %llx %llx %llx %llx\n",
+			     dd->ipath_consec_nopiobuf,
+			     dd->ipath_pioavailregs_dma[0],
+			     dd->ipath_pioavailregs_dma[1],
+			     dd->ipath_pioavailregs_dma[2],
+			     dd->ipath_pioavailregs_dma[3],
+			     shadow[0], shadow[1], shadow[2], shadow[3]);
+			/*
+			 * 4 buffers per byte, 4 registers above, cover
+			 * rest below
+			 */
+			if (dd->ipath_piobcnt > (sizeof(shadow[0]) * 4 * 4))
+				_IPATH_DBG
+				    ("2nd group: dmacopy: %llx %llx %llx %llx; shadow: %llx %llx %llx %llx\n",
+				     dd->ipath_pioavailregs_dma[4],
+				     dd->ipath_pioavailregs_dma[5],
+				     dd->ipath_pioavailregs_dma[6],
+				     dd->ipath_pioavailregs_dma[7],
+				     shadow[4], shadow[5], shadow[6],
+				     shadow[7]);
+		}
+		return -EBUSY;
+	}
+
+	if (updated && dd->ipath_layer.l_intr) {
+		/*
+		 * ran out of bufs, now some (at least this one we just got)
+		 * are now available, so tell the layered driver.
+		 */
+		dd->ipath_layer.l_intr(mdev, IPATH_LAYER_INT_SEND_CONTINUE);
+	}
+
+	/*
+	 * set next starting place.  Since it's just an optimization,
+	 * it doesn't matter who wins on this, so no locking
+	 */
+	dd->ipath_lastpioindex = i + 1;
+	if (dd->ipath_upd_pio_shadow)
+		dd->ipath_upd_pio_shadow = 0;
+	if (dd->ipath_consec_nopiobuf)
+		dd->ipath_consec_nopiobuf = 0;
+	return i;
+}
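+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * The pioavail shadow above keeps two bits per PIO buffer: bit
+ * (2 * i) + 1 is the busy bit that ipath_getpiobuf() test-and-sets,
+ * and bit 2 * i is the generation bit it flips.  A minimal sketch of
+ * that encoding (hypothetical helpers, invented names):
+ */
+#if 0
+static int example_pio_busy(const unsigned long *shadow, unsigned i)
+{
+	return test_bit((2 * i) + 1, shadow);
+}
+
+static void example_pio_claim(unsigned long *shadow, unsigned i)
+{
+	set_bit((2 * i) + 1, shadow);	/* mark busy */
+	change_bit(2 * i, shadow);	/* flip generation */
+}
+#endif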
+
+/*
+ * this is like ipath_getpiobuf(), except it just probes to see if a buffer
+ * is available.  If it returns that there is one, it's not allocated,
+ * and so may not be available if caller tries to send.
+ * NOTE: This can be called from interrupt context by ipath_intr()
+ * and from non-interrupt context by layer_send_getpiobuf().
+ */
+int ipath_bufavail(int mdev)
+{
+	int i;
+	unsigned piobcnt;
+	uint64_t *shadow = devdata[mdev].ipath_pioavailshadow;
+
+	piobcnt = (unsigned)devdata[mdev].ipath_piobcnt;
+
+	for (i = devdata[mdev].ipath_lastport_piobuf; i < piobcnt; i++)
+		if (!test_bit((2 * i) + 1, shadow))
+			return 1;
+
+	/* if none, check for update and rescan if we updated */
+	ipath_update_pio_bufs(mdev);
+	for (i = devdata[mdev].ipath_lastport_piobuf; i < piobcnt; i++)
+		if (!test_bit((2 * i) + 1, shadow))
+			return 1;
+	_IPATH_PDBG("No bufs avail\n");
+	return 0;
+}
+
+/*
+ * This routine is no longer on any critical paths; it is used only
+ * for sending SMA packets, but that could change in the future, so it
+ * should be kept pretty tight, with anything that increases the cache
+ * footprint, adds branches, etc. carefully examined; anything needed
+ * only for unusual cases should be moved out to a separate routine,
+ * or out of the main execution path.
+ * Because it's currently sma only, there are no checks to see if the
+ * link is up; sma must be able to send in the not fully initialized state
+ */
+int ipath_send_smapkt(struct ipath_sendpkt * upkt)
+{
+	int i, ret = 0, whichpb;
+	uint32_t *piobuf, plen = 0, clen;
+	uint64_t pboff;
+	struct ipath_sendpkt kpkt;
+	struct ipath_iovec *iov = kpkt.sps_iov;
+	ipath_type t;
+
+	if (unlikely((copy_from_user(&kpkt, upkt, sizeof kpkt))))
+		ret = -EFAULT;
+	if (ret) {
+		_IPATH_VDBG("Send failed: error %d\n", -ret);
+		goto done;
+	}
+	t = kpkt.sps_flags;
+	if (t >= infinipath_max || !(devdata[t].ipath_flags & IPATH_PRESENT) ||
+	    !devdata[t].ipath_kregbase) {
+		_IPATH_SMADBG("illegal unit %u for sma send\n", t);
+		return -ENODEV;
+	}
+	if (!(devdata[t].ipath_flags & IPATH_INITTED)) {
+		/* no hardware, freeze, etc. */
+		_IPATH_SMADBG("unit %u not usable\n", t);
+		return -ENODEV;
+	}
+
+	/* need total length before first word written */
+	plen = sizeof(uint32_t);	/* +1 word is for the qword padding */
+	for (i = 0; i < kpkt.sps_cnt; i++)
+		/* each must be dword multiple */
+		plen += kpkt.sps_iov[i].iov_len;
+
+	if ((plen + 4) > devdata[t].ipath_ibmaxlen) {
+		_IPATH_DBG("Pkt len 0x%x > ibmaxlen %x!\n", plen - 4,
+			   devdata[t].ipath_ibmaxlen);
+		ret = -EINVAL;
+		goto done;	/* before writing pbc */
+	}
+	plen >>= 2;	/* in words */
+
+	whichpb = ipath_getpiobuf(t);
+	if (whichpb < 0) {
+		ret = whichpb;
+		devdata[t].ipath_nosma_bufs++;
+		_IPATH_SMADBG("No PIO buffers available unit %u %u times\n",
+			      t, devdata[t].ipath_nosma_bufs);
+		goto done;
+	}
+	if (devdata[t].ipath_nosma_bufs) {
+		_IPATH_SMADBG
+		    ("Unit %u got SMA send buffer after %u failures, %u seconds\n",
+		     t, devdata[t].ipath_nosma_bufs,
+		     devdata[t].ipath_nosma_secs);
+		devdata[t].ipath_nosma_bufs = 0;
+		devdata[t].ipath_nosma_secs = 0;
+	}
+	if ((devdata[t].ipath_lastibcstat & 0x11) != 0x11 &&
+	    (devdata[t].ipath_lastibcstat & 0x21) != 0x21) {
+		/*
+		 * we need to be at least at INIT for SMA packets to go out.
+		 * If we aren't, something has gone wrong, and SMA hasn't
+		 * noticed.  Therefore we'll try to go to INIT here, in
+		 * hopes of fixing up the problem.
+		 * First we verify that indeed the state is still "bad"
+		 * (that is, that lastibcstat isn't "stale")
+		 */
+		uint64_t val;
+		val = ipath_kget_kreg64(t, kr_ibcstatus);
+		if ((val & 0x11) != 0x11 && (val & 0x21) != 0x21) {
+			_IPATH_SMADBG
+			    ("Invalid Link state 0x%llx unit %u for send, try INIT\n",
+			     val, t);
+			ipath_set_ib_lstate(t, INFINIPATH_IBCC_LINKCMD_INIT);
+			val = ipath_kget_kreg64(t, kr_ibcstatus);
+			if ((val & 0x11) != 0x11 && (val & 0x21) != 0x21)
+				_IPATH_SMADBG
+				    ("Link state still not OK unit %u (0x%llx) after INIT\n",
+				     t, val);
+			else
+				_IPATH_SMADBG
+				    ("Link state OK unit %u (0x%llx) after INIT\n",
+				     t, val);
+		}
+		/* and continue, regardless */
+	}
+
+	pboff = devdata[t].ipath_piobufbase;
+	piobuf = (uint32_t *) (((char *)(devdata[t].ipath_kregbase)) + pboff +
+			       whichpb * devdata[t].ipath_palign);
+
+	if (infinipath_debug & __IPATH_PKTDBG)	/* SMA and PKT, both */
+		_IPATH_SMADBG("unit %u 0x%x+1w pio%d, (scnt %d)\n",
+			      t, plen - 1, whichpb, kpkt.sps_cnt);
+
+	ret = 0;
+	clen = 2;	/* size of the pbc */
+	{
+		/*
+		 * If this code ever gets used for anything performance
+		 * oriented, or that isn't inherently single-threaded,
+		 * then I need to implement the original idea of our
+		 * own equivalent of copy_from_user that uses only dword
+		 * or qword copies.  copy_from_user() can use byte copies,
+		 * and that is a problem for our chip.
+		 */
+		static uint32_t tmpbuf[2176 / sizeof(uint32_t)];
+		*(uint64_t *) tmpbuf = (uint64_t) plen;
+		for (i = 0; i < kpkt.sps_cnt; i++) {
+			if (unlikely
+			    (copy_from_user
+			     (tmpbuf + clen, (void *)iov->iov_base,
+			      iov->iov_len)))
+				ret = -EFAULT;	/* no break */
+			clen += iov->iov_len >> 2;
+			iov++;
+		}
+		ipath_dwordcpy(piobuf, tmpbuf, clen);
+	}
+
+	/* flush the packet out now, don't leave it waiting around */
+	mb();
+
+	if (ret) {
+		/*
+		 * Packet is bad, so we need to use the PIO abort mechanism to
+		 * abort the packet
+		 */
+		uint32_t sendctrl;
+		sendctrl = devdata[t].ipath_sendctrl | INFINIPATH_S_DISARM |
+		    (whichpb << INFINIPATH_S_DISARMPIOBUF_SHIFT);
+		_IPATH_DBG("Doing PIO abort on buffer %u after error\n",
+			   whichpb);
+		ipath_kput_kreg(t, kr_sendctrl, sendctrl);
+	}
+
+done:
+	return ret;
+}
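+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * The comment in ipath_send_smapkt() above notes that copy_from_user()
+ * may fall back to byte copies, which the chip's PIO buffers can't
+ * tolerate.  A dword-only copy such as ipath_dwordcpy() amounts to the
+ * following sketch (hypothetical stand-alone version; the driver's
+ * real helper may differ):
+ */
+#if 0
+static void example_dwordcpy(uint32_t *dst, const uint32_t *src,
+			     uint32_t ndwords)
+{
+	while (ndwords--)
+		*dst++ = *src++;	/* exactly one 32-bit store each */
+}
+#endif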
+
+/*
+ * implementation of the ioctl to get the counter values from the chip
+ * For the time being, we get all of them when asked, no shadowing.
+ * We need to shadow the byte counters at a minimum, because otherwise
+ * they will wrap in just a few seconds at full bandwidth
+ * The second argument is the user address to which we do the copy_to_user()
+ */
+static int ipath_get_counters(ipath_type t,
+			      struct infinipath_counters * ucounters)
+{
+	int ret = 0;
+	uint64_t val;
+	uint64_t *ucreg;
+	uint16_t vcreg;
+
+	ucreg = (uint64_t *) ucounters;
+	/*
+	 * for now, let's do this one at a time.  It's not the most
+	 * optimal method, but it is simple, and has no intermediate
+	 * memory requirements.
+	 */
+	for (vcreg = 0;
+	     vcreg < (sizeof(struct infinipath_counters) / sizeof(val));
+	     vcreg++, ucreg++) {
+		ipath_creg creg = vcreg;
+		val = ipath_snap_cntr(t, creg);
+		if ((ret = copy_to_user(ucreg, &val, sizeof(val)))) {
+			_IPATH_DBG("copy_to_user error on counter %d\n", creg);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * implementation of the ioctl to get the stats values from the driver
+ * The argument is the user address to which we do the copy_to_user()
+ */
+static int ipath_get_stats(struct infinipath_stats *ustats)
+{
+	int ret = 0;
+
+	if ((ret = copy_to_user(ustats, &ipath_stats, sizeof(ipath_stats))))
+		_IPATH_DBG("copy_to_user error on driver stats\n");
+
+	return ret;
+}
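+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * ipath_set_partkey() below packs the four 16-bit partition keys into
+ * the single partition key register.  Reduced to a stand-alone sketch
+ * (hypothetical helper, mirroring the packing done in the function):
+ */
+#if 0
+static uint64_t example_pack_pkeys(const uint16_t *pkeys)
+{
+	return (uint64_t) pkeys[0] |
+	    ((uint64_t) pkeys[1] << 16) |
+	    ((uint64_t) pkeys[2] << 32) |
+	    ((uint64_t) pkeys[3] << 48);
+}
+#endif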
+
+/*
+ * set a partition key.  We can have up to 4 active at a time (other than
+ * the default, which is always allowed).  This is somewhat tricky, since
+ * multiple ports may set the same key, so we reference count them, and
+ * clean up at exit.  All 4 partition keys are packed into a single
+ * infinipath register.  It's an error for a process to set the same
+ * pkey multiple times.  We provide no mechanism to de-allocate a pkey
+ * at this time, we may eventually need to do that.
+ * I've used the atomic operations, and no locking, and only make a single
+ * pass through what's available.  This should be more than adequate for
+ * some time.  I'll think about spinlocks or the like if and as it's necessary
+ */
+static int ipath_set_partkey(ipath_portdata *pd, uint16_t key)
+{
+	ipath_devdata *dd;
+	int i, any = 0, pidx = -1;
+	uint16_t lkey = key & 0x7FFF;
+
+	dd = &devdata[pd->port_unit];
+
+	if (lkey == (IPS_DEFAULT_P_KEY & 0x7FFF)) {
+		/* nothing to do; this key always valid */
+		return 0;
+	}
+
+	_IPATH_VDBG
+	    ("p%u try to set pkey %hx, current keys %hx:%x %hx:%x %hx:%x %hx:%x\n",
+	     pd->port_port, key, dd->ipath_pkeys[0],
+	     atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
+	     atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
+	     atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
+	     atomic_read(&dd->ipath_pkeyrefs[3]));
+
+	if (!lkey) {
+		_IPATH_PRDBG("p%u tries to set key 0, not allowed\n",
+			     pd->port_port);
+		return -EINVAL;
+	}
+
+	/*
+	 * Set the full membership bit, because it has to be
+	 * set in the register or the packet, and it seems
+	 * cleaner to set in the register than to force all
+	 * callers to set it.  (see bug 4331)
+	 */
+	key |= 0x8000;
+
+	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+		if (!pd->port_pkeys[i] && pidx == -1)
+			pidx = i;
+		if (pd->port_pkeys[i] == key) {
+			_IPATH_VDBG
+			    ("p%u tries to set same pkey (%x) more than once\n",
+			     pd->port_port, key);
+			return -EEXIST;
+		}
+	}
+	if (pidx == -1) {
+		_IPATH_DBG
+		    ("All pkeys for port %u already in use, can't set %x\n",
+		     pd->port_port, key);
+		return -EBUSY;
+	}
+	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i]) {
+			any++;
+			continue;
+		}
+		if (dd->ipath_pkeys[i] == key) {
+			if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
+				pd->port_pkeys[pidx] = key;
+				_IPATH_VDBG
+				    ("p%u set key %x matches #%d, count now %d\n",
+				     pd->port_port, key, i,
+				     atomic_read(&dd->ipath_pkeyrefs[i]));
+				return 0;
+			} else {
+				/* lost race, decrement count, catch below */
+				atomic_dec(&dd->ipath_pkeyrefs[i]);
+				_IPATH_VDBG
+				    ("Lost race, count was 0, after dec, it's %d\n",
+				     atomic_read(&dd->ipath_pkeyrefs[i]));
+				any++;
+			}
+		}
+		if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+			/*
+			 * It makes no sense to have both the limited and full
+			 * membership PKEY set at the same time since the
+			 * unlimited one will disable the limited one.
+			 */
+			return -EEXIST;
+		}
+	}
+	if (!any) {
+		_IPATH_DBG
+		    ("port %u, all pkeys already in use, can't set %x\n",
+		     pd->port_port, key);
+		return -EBUSY;
+	}
+	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i] &&
+		    atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+			uint64_t pkey;
+
+			/* for ipathstats, etc. */
+			ipath_stats.sps_pkeys[i] = lkey;
+			pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
+			pkey =
+			    (uint64_t) dd->ipath_pkeys[0] |
+			    ((uint64_t) dd->ipath_pkeys[1] << 16) |
+			    ((uint64_t) dd->ipath_pkeys[2] << 32) |
+			    ((uint64_t) dd->ipath_pkeys[3] << 48);
+			_IPATH_PRDBG
+			    ("p%u set key %x in #%d, portidx %d, new pkey reg %llx\n",
+			     pd->port_port, key, i, pidx, pkey);
+			ipath_kput_kreg(pd->port_unit, kr_partitionkey, pkey);
+
+			return 0;
+		}
+	}
+	_IPATH_DBG
+	    ("port %u, all pkeys already in use 2nd pass, can't set %x\n",
+	     pd->port_port, key);
+	return -EBUSY;
+}
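+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * ipath_manage_rcvq() below toggles one bit per port in the receive
+ * control register.  The bit arithmetic, as a stand-alone sketch
+ * (hypothetical helper):
+ */
+#if 0
+static uint64_t example_rcvctrl(uint64_t rcvctrl, unsigned port, int enable)
+{
+	uint64_t bit = 1ULL << (INFINIPATH_R_PORTENABLE_SHIFT + port);
+
+	return enable ? (rcvctrl | bit) : (rcvctrl & ~bit);
+}
+#endif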
+
+/*
+ * start_stop == 0 disables receive on the port, for use in queue overflow
+ * conditions.  start_stop == 1 re-enables, and returns value of tail
+ * register, to be used to re-init the software copy of the head register
+ */
+static int ipath_manage_rcvq(ipath_portdata * pd, uint16_t start_stop)
+{
+	ipath_devdata *dd;
+	/*
+	 * This needs to be volatile, so that the compiler doesn't
+	 * optimize away the read to the device's mapped memory.
+	 */
+	volatile uint64_t tval;
+
+	dd = &devdata[pd->port_unit];
+	_IPATH_PRDBG("%sabling rcv for unit %u port %u\n",
+		     start_stop ? "en" : "dis", pd->port_unit, pd->port_port);
+	/* atomically set or clear the receive enable bit for the port. */
+	if (start_stop) {
+		/*
+		 * on enable, force in-memory copy of the tail register
+		 * to 0, so that protocol code doesn't have to worry
+		 * about whether or not the chip has yet updated
+		 * the in-memory copy or not on return from the system
+		 * call.  The chip always resets its tail register back
+		 * to 0 on a transition from disabled to enabled.
+		 * This could cause a problem if software was broken,
+		 * and did the enable w/o the disable, but eventually
+		 * the in-memory copy will be updated and correct
+		 * itself, even in the face of software bugs.
+		 */
+		*pd->port_rcvhdrtail_kvaddr = 0;
+		atomic_set_mask(1U <<
+				(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port),
+				&dd->ipath_rcvctrl);
+	} else
+		atomic_clear_mask(1U <<
+				  (INFINIPATH_R_PORTENABLE_SHIFT +
+				   pd->port_port), &dd->ipath_rcvctrl);
+	ipath_kput_kreg(pd->port_unit, kr_rcvctrl, dd->ipath_rcvctrl);
+	/* now be sure chip saw it before we return */
+	tval = ipath_kget_kreg64(pd->port_unit, kr_scratch);
+	if (start_stop) {
+		/*
+		 * and try to be sure that tail reg update has happened
+		 * too.  This should in theory interlock with the RXE
+		 * changes to the tail register.  Don't assign it to
+		 * the tail register in memory copy, since we could
+		 * overwrite an update by the chip if we did.
+		 */
+		tval =
+		    ipath_kget_ureg32(pd->port_unit, ur_rcvhdrtail,
+				      pd->port_port);
+	}
+	/* always; new head should be equal to new tail; see above */
+	return 0;
+}
+
+/*
+ * This routine is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance
+ * This is the user port version
+ *
+ * allocate the eager TID buffers and program them into infinipath
+ * They are no longer completely contiguous, we do multiple
+ * alloc_pages() calls.
+ */
+static int ipath_create_user_egr(ipath_portdata * pd)
+{
+	char *buf;
+	ipath_devdata *dd = &devdata[pd->port_unit];
+	uint64_t *egrbase, egroff, lenvalid;
+	unsigned e, egrcnt, alloced, order, egrperchunk, chunk;
+	unsigned long pa, pent;
+
+	egrcnt = dd->ipath_rcvegrcnt;
+	egroff =
+	    dd->ipath_rcvegrbase + pd->port_port * egrcnt * sizeof(*egrbase);
+	egrbase = (uint64_t *) ((char *)(dd->ipath_kregbase) + egroff);
+	_IPATH_VDBG("Allocating %d egr buffers, at chip offset %llx (%p)\n",
+		    egrcnt, egroff, egrbase);
+
+	/*
+	 * to avoid wasting a lot of memory, we allocate 32KB chunks of
+	 * physically contiguous memory, advance through it until used up
+	 * and then allocate more.  Of course, we need memory to store
+	 * those extra pointers, now.  Started out with 256KB, but under
+	 * heavy memory pressure (creating large files and then copying
+	 * them over NFS while doing lots of MPI jobs), we hit some
+	 * alloc_pages() failures, even though we can sleep...  (2.6.10)
+	 * Still get failures at 64K.  32K is the lowest we can go without
+	 * waiting for more memory again.  It seems likely that the
+	 * coalescing in free_pages, etc. still has issues (as it has had
+	 * previously during 2.6.x development).
+	 */
+	order = get_order(0x8000);
+	alloced =
+	    round_up(dd->ipath_rcvegrbufsize * egrcnt,
+		     (1 << order) * PAGE_SIZE);
+	egrperchunk = ((1 << order) * PAGE_SIZE) / dd->ipath_rcvegrbufsize;
+	chunk = (egrcnt + egrperchunk - 1) / egrperchunk;
+	pd->port_rcvegrbuf_chunks = chunk;
+	pd->port_rcvegrbufs_perchunk = egrperchunk;
+	pd->port_rcvegrbuf_order = order;
+	pd->port_rcvegrbuf_pages =
+	    vmalloc(chunk * sizeof(pd->port_rcvegrbuf_pages[0]));
+	pd->port_rcvegrbuf_virt =
+	    vmalloc(chunk * sizeof(pd->port_rcvegrbuf_virt[0]));
+	if (!pd->port_rcvegrbuf_pages || !pd->port_rcvegrbuf_virt) {
+		_IPATH_UNIT_ERROR(pd->port_unit,
+				  "Unable to allocate %u EGR buffer array pointers\n",
+				  chunk);
+		if (pd->port_rcvegrbuf_pages) {
+			vfree(pd->port_rcvegrbuf_pages);
+			pd->port_rcvegrbuf_pages = NULL;
+		}
+		if (pd->port_rcvegrbuf_virt) {
+			vfree(pd->port_rcvegrbuf_virt);
+			pd->port_rcvegrbuf_virt = NULL;
+		}
+		return -ENOMEM;
+	}
+	for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+		/*
+		 * GFP_USER, but without GFP_FS, so buffer cache can
+		 * be coalesced (we hope); otherwise, even at order 4, heavy
+		 * filesystem activity makes these fail
+		 */
+		if (!(pd->port_rcvegrbuf_pages[e] =
+		      alloc_pages(__GFP_WAIT | __GFP_IO, order))) {
+			_IPATH_UNIT_ERROR(pd->port_unit,
+					  "Unable to allocate EGR buffer array %u/%u\n",
+					  e, pd->port_rcvegrbuf_chunks);
+			vfree(pd->port_rcvegrbuf_pages);
+			pd->port_rcvegrbuf_pages = NULL;
+			vfree(pd->port_rcvegrbuf_virt);
+			pd->port_rcvegrbuf_virt = NULL;
+			return -ENOMEM;
+		}
+	}
+
+	/*
+	 * calculate physical, then phys_to_virt()
+	 * so that we get an address that fits in 64 bits, so we can use
+	 * mmap64 from 32 bit programs on the chip and kernel virtual
+	 * addresses (mmap64 for 32 bit programs on i386 and x86_64
+	 * only has 44 bits of address, because it uses mmap2())
+	 * We do this with the first chunk; we don't need a kernel
+	 * virtually contiguous address to give the user virtually
+	 * contiguous mappings.  It just complicates the nopage routine
+	 * a little tiny bit ;)
+	 */
+	buf = page_address(pd->port_rcvegrbuf_pages[0]);
+	pa = virt_to_phys(buf);
+	pd->port_rcvegr_phys = pa;
+
+	/* in words */
+	lenvalid = (dd->ipath_rcvegrbufsize - pd->port_egrskip) >> 2;
+	_IPATH_VDBG
+	    ("port%u egrbuf vaddr %p, cpu %d, egrskip %u, len %llx words\n",
+	     pd->port_port, buf, smp_processor_id(), pd->port_egrskip,
+	     lenvalid);
+	lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+	lenvalid |= INFINIPATH_RT_VALID;
+
+	for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
+		int i, n;
+		struct page *p;
+
+		p = pd->port_rcvegrbuf_pages[chunk];
+		pa = page_to_phys(p);
+		buf = page_address(p);
+		/*
+		 * stash away for later use, since page_address() lookup
+		 * is not cheap
+		 */
+		pd->port_rcvegrbuf_virt[chunk] = buf;
+		if (pa & ~INFINIPATH_RT_ADDR_MASK)
+			_IPATH_INFO
+			    ("physaddr %lx has more than 40 bits, using only 40!\n",
+			     pa);
+		n = 1 << pd->port_rcvegrbuf_order;
+		for (i = 0; i < n; i++)
+			SetPageReserved(virt_to_page(buf + (i * PAGE_SIZE)));
+
+		/* clear buffer for security, sanity, and debugging */
+		memset(buf, 0, PAGE_SIZE * n);
+
+		for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
+			pent =
+			    ((pa +
+			      pd->port_egrskip) & INFINIPATH_RT_ADDR_MASK) |
+			    lenvalid;
+
+			ipath_kput_memq(pd->port_unit, &egrbase[e], pent);
+			_IPATH_VDBG("egr %u phys %lx val %lx\n", e, pa, pent);
+			pa += dd->ipath_rcvegrbufsize;
+		}
+		yield();	/* don't hog the cpu */
+	}
+
+	return 0;
+}
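+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * The chunking arithmetic above, with made-up example numbers: 32 KB
+ * chunks and 2048-byte eager buffers give egrperchunk = 16, so 512
+ * eager TIDs need 32 chunk allocations.  As a sketch:
+ */
+#if 0
+static unsigned example_egr_chunks(unsigned chunkbytes, unsigned bufsize,
+				   unsigned egrcnt)
+{
+	unsigned egrperchunk = chunkbytes / bufsize;
+
+	return (egrcnt + egrperchunk - 1) / egrperchunk;	/* round up */
+}
+#endif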
+
+/*
+ * This routine is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance
+ * This is the kernel (port0) version
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * We use the network layer alloc_skb() allocator to allocate the memory, and
+ * either use the buffers as is for things like SMA packets, or pass
+ * the buffers up to the ipath layered driver and thence the network layer,
+ * replacing them as we do so (see ipath_kreceive())
+ */
+static int ipath_create_port0_egr(ipath_portdata * pd)
+{
+	int ret = 0;
+	uint64_t *egrbase, egroff;
+	unsigned e, egrcnt;
+	ipath_devdata *dd;
+	struct sk_buff **skbs;
+
+	dd = &devdata[pd->port_unit];
+	egrcnt = dd->ipath_rcvegrcnt;
+	egroff =
+	    dd->ipath_rcvegrbase + pd->port_port * egrcnt * sizeof(*egrbase);
+	egrbase = (uint64_t *) ((char *)(dd->ipath_kregbase) + egroff);
+	_IPATH_VDBG
+	    ("unit%u Allocating %d egr buffers, at chip offset %llx (%p)\n",
+	     pd->port_unit, egrcnt, egroff, egrbase);
+
+	skbs = vmalloc(sizeof(*dd->ipath_port0_skbs) * egrcnt);
+	if (skbs == NULL)
+		ret = -ENOMEM;
+	else {
+		for (e = 0; e < egrcnt; e++) {
+			/*
+			 * This is a bit tricky in that we allocate
+			 * extra space for 2 bytes of the 14 byte
+			 * ethernet header.  These two bytes are passed
+			 * in the ipath header so the rest of the data
+			 * is word aligned.  We allocate 4 bytes so that the
+			 * data buffer stays word aligned.
+			 * See ipath_kreceive() for more details.
+			 */
+			skbs[e] =
+			    __dev_alloc_skb(dd->ipath_ibmaxlen + 4, GFP_KERNEL);
+			if (skbs[e] == NULL) {
+				_IPATH_UNIT_ERROR(pd->port_unit,
+						  "SKB allocation error for eager TID %u\n",
+						  e);
+				while (e != 0)
+					dev_kfree_skb(skbs[--e]);
+				ret = -ENOMEM;
+				break;
+			}
+			skb_reserve(skbs[e], 4);
+		}
+	}
+	/*
+	 * after loop above, so we can test non-NULL
+	 * to see if ready to use at receive, etc.  Hope this fixes some
+	 * panics.
+	 */
+	dd->ipath_port0_skbs = skbs;
+
+	/*
+	 * have to tell chip each time we init it
+	 * even if we are re-using previous memory.
+	 */
+	if (!ret) {
+		uint64_t lenvalid;	/* in words */
+
+		lenvalid = (dd->ipath_ibmaxlen - pd->port_egrskip) >> 2;
+		lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+		lenvalid |= INFINIPATH_RT_VALID;
+		for (e = 0; e < egrcnt; e++) {
+			unsigned long pa, pent;
+
+			pa = virt_to_phys(dd->ipath_port0_skbs[e]->data);
+			pa += pd->port_egrskip;
+			if (!e && (pa & ~INFINIPATH_RT_ADDR_MASK))
+				_IPATH_INFO
+				    ("phys addr %lx has more than 40 bits, using only 40!!!\n",
+				     pa);
+			pent = (pa & INFINIPATH_RT_ADDR_MASK) | lenvalid;
+			/*
+			 * don't need this except for extreme debugging,
+			 * but leaving it to save future typing:
+			 * _IPATH_VDBG("egr[%d] %p <- %lx\n", e, &egrbase[e], pent);
+			 */
+			ipath_kput_memq(pd->port_unit, &egrbase[e], pent);
+		}
+		yield();	/* don't hog the cpu */
+	}
+
+	return ret;
+}
+
+/*
+ * this *must* be physically contiguous memory, and for now,
+ * that limits it to what kmalloc can do.
+ */
+static int ipath_create_rcvhdrq(ipath_portdata * pd)
+{
+	int i, ret = 0, amt, order, pgs;
+	char *qt;
+	struct page *p;
+	unsigned long pa, pa0;
+
+	amt = round_up(devdata[pd->port_unit].ipath_rcvhdrcnt
+		       * devdata[pd->port_unit].ipath_rcvhdrentsize *
+		       sizeof(uint32_t), PAGE_SIZE);
+	if (!pd->port_rcvhdrq) {
+		order = get_order(amt);
+		/*
+		 * not using REPEAT isn't viable; at 128KB, we can easily
+		 * fail this.  The problem with REPEAT is we can block here
+		 * "forever".  There isn't an in-between, unfortunately.
+		 * We could reduce the risk by never freeing the rcvhdrq
+		 * except at unload, but even then, the first time a
+		 * port is used, we could delay for some time...
+		 */
+		p = alloc_pages(GFP_USER, order);
+		if (!p) {
+			_IPATH_UNIT_ERROR(pd->port_unit,
+					  "attempt to allocate order %u memory for port %u rcvhdrq failed\n",
+					  order, pd->port_port);
+			return -ENOMEM;
+		}
+
+		/*
+		 * should use kmap (and later kunmap), even though high mem
+		 * will always be mapped on x86_64, to play it safe, but for
+		 * some bizarre reason these aren't exported symbols...
+		 */
+		pd->port_rcvhdrq = page_address(p);
+		if (!virt_addr_valid(pd->port_rcvhdrq)) {
+			_IPATH_DBG
+			    ("weird, virt_addr_valid false right after alloc_pages\n");
+			_IPATH_DBG("__pa(%p) is %lx, num_physpages %lx\n",
+				   pd->port_rcvhdrq, __pa(pd->port_rcvhdrq),
+				   num_physpages);
+		}
+		pd->port_rcvhdrq_phys = virt_to_phys(pd->port_rcvhdrq);
+		pd->port_rcvhdrq_order = order;
+
+		pa0 = pd->port_rcvhdrq_phys;
+		pgs = amt >> PAGE_SHIFT;
+		_IPATH_VDBG
+		    ("%d pages at %p (phys %lx) order=%u for port %u rcvhdr Q\n",
+		     pgs, pd->port_rcvhdrq, pa0, pd->port_rcvhdrq_order,
+		     pd->port_port);
+
+		/*
+		 * verify it's really physically contiguous, to be paranoid;
+		 * also mark pages as reserved, to avoid problems when a
+		 * user process with them mapped then exits.
+		 */
+		qt = pd->port_rcvhdrq;
+		SetPageReserved(virt_to_page(qt));
+		qt += PAGE_SIZE;
+		for (pa = pa0, i = 1; i < pgs; i++, qt += PAGE_SIZE) {
+			SetPageReserved(virt_to_page(qt));
+			pa = virt_to_phys(qt);
+			if (pa != (pa0 + (i * PAGE_SIZE)))
+				_IPATH_INFO
+				    ("pg %d at %p phys %lx not contiguous\n", i,
+				     qt, pa);
+			else
+				_IPATH_VDBG("pg %d at %p phys %lx\n", i, qt,
+					    pa);
+		}
+	}
+
+	/*
+	 * clear for security, sanity, and/or debugging (each time we
+	 * use/reuse)
+	 */
+	memset(pd->port_rcvhdrq, 0, amt);
+
+	/*
+	 * tell chip each time we init it, even if we are re-using previous
+	 * memory (we zero it at process close)
+	 */
+	_IPATH_VDBG("writing port %d rcvhdraddr as %lx\n", pd->port_port,
+		    pd->port_rcvhdrq_phys);
+	ipath_kput_kreg_port(pd->port_unit, kr_rcvhdraddr, pd->port_port,
+			     pd->port_rcvhdrq_phys);
+
+	return ret;
+}
+
+#ifdef _IPATH_EXTRA_DEBUG
+/*
+ * occasionally useful to dump the full set of kernel registers for debugging.
+ */
+static void ipath_dump_allregs(char *what, ipath_type t)
+{
+	uint16_t reg;
+
+	_IPATH_DBG("%s\n", what);
+	for (reg = 0; reg <= 0x100; reg++) {
+		uint64_t v = ipath_kget_kreg64(t, reg);
+
+		if (!(reg % 4))
+			printk("\n%3x: ", reg);
+		printk("%16llx ", v);
+	}
+	printk("\n");
+}
+#endif /* _IPATH_EXTRA_DEBUG */
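+
+/*
+ * [Editor's illustrative aside -- not part of the original patch]
+ * The paranoid contiguity walk in ipath_create_rcvhdrq() above,
+ * reduced to a predicate (hypothetical helper):
+ */
+#if 0
+static int example_physically_contiguous(void *base, int npages)
+{
+	unsigned long pa0 = virt_to_phys(base);
+	int i;
+
+	for (i = 1; i < npages; i++)
+		if (virt_to_phys((char *)base + i * PAGE_SIZE) !=
+		    pa0 + i * PAGE_SIZE)
+			return 0;
+	return 1;
+}
+#endif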
+
+/*
+ * Do the actual initialization sequence on the chip.  For the real
+ * hardware, this is done from the init routine called from the PCI
+ * infrastructure.
+ */
+int ipath_init_chip(const ipath_type t)
+{
+	int ret = 0, i;
+	uint32_t val32, kpiobufs;
+	uint64_t val, atmp;
+	volatile uint32_t *piobuf;
+	uint32_t pioincr;
+	ipath_devdata *dd = &devdata[t];
+	ipath_portdata *pd;
+	struct page *vpage;
+	char boardn[32];
+
+	/* first time only, set after static version info */
+	if (!chip_driver_version) {
+		i = strlen(ipath_core_version);
+		chip_driver_version = ipath_core_version + i;
+		chip_driver_size = sizeof ipath_core_version - i;
+	}
+
+	/*
+	 * have to clear shadow copies of registers at init that are not
+	 * otherwise set here, or all kinds of bizarre things happen with
+	 * driver on chip reset
+	 */
+	dd->ipath_rcvhdrsize = 0;
+
+	/*
+	 * don't clear ipath_flags, as 8bit mode was set before entering
+	 * this func.  However, we do set the linkstate to unknown
+	 */
+
+	/* so we can watch for a transition */
+	dd->ipath_flags |= IPATH_LINKUNK;
+	dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED | IPATH_LINKDOWN
+			     | IPATH_LINKINIT);
+
+	_IPATH_VDBG("Try to read spc chip revision\n");
+	dd->ipath_revision = ipath_kget_kreg64(t, kr_revision);
+
+	/*
+	 * set up fundamental info we need to use the chip; we assume if
+	 * the revision reg and these regs are OK, we don't need to special
+	 * case the rest
+	 */
+	dd->ipath_sregbase = ipath_kget_kreg32(t, kr_sendregbase);
+	dd->ipath_cregbase = ipath_kget_kreg32(t, kr_counterregbase);
+	dd->ipath_uregbase = ipath_kget_kreg32(t, kr_userregbase);
+	_IPATH_VDBG("ipath_kregbase %p, sendbase %x usrbase %x, cntrbase %x\n",
+		    dd->ipath_kregbase, dd->ipath_sregbase, dd->ipath_uregbase,
+		    dd->ipath_cregbase);
+	if ((dd->ipath_revision & 0xffffffff) == 0xffffffff ||
+	    (dd->ipath_sregbase & 0xffffffff) == 0xffffffff ||
+	    (dd->ipath_cregbase & 0xffffffff) == 0xffffffff ||
+	    (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
+		_IPATH_UNIT_ERROR(t,
+				  "Register read failures from chip, giving up initialization\n");
+		ret = -ENODEV;
+		goto done;
+	}
+
+	/* clear the initial reset flag, in case first driver load */
+	ipath_kput_kreg(t, kr_errorclear, INFINIPATH_E_RESET);
+
+	dd->ipath_portcnt = ipath_kget_kreg32(t, kr_portcnt);
+	if (!infinipath_cfgports)
+		dd->ipath_cfgports = dd->ipath_portcnt;
+	else if (infinipath_cfgports <= dd->ipath_portcnt) {
+		dd->ipath_cfgports = infinipath_cfgports;
+		_IPATH_DBG("Configured to use %u ports out of %u in chip\n",
+			   dd->ipath_cfgports, dd->ipath_portcnt);
+	} else {
+		dd->ipath_cfgports = dd->ipath_portcnt;
+		_IPATH_DBG
+		    ("Tried to configure %u ports; chip only supports %u\n",
+		     infinipath_cfgports, dd->ipath_portcnt);
+	}
+	dd->ipath_pd = kmalloc(sizeof(*dd->ipath_pd) * dd->ipath_cfgports,
+			       GFP_KERNEL);
+	if (!dd->ipath_pd) {
+		_IPATH_UNIT_ERROR(t,
+				  "Unable to allocate portdata array, failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	memset(dd->ipath_pd, 0, sizeof(*dd->ipath_pd) * dd->ipath_cfgports);
+
+	dd->ipath_lastegrheads = kmalloc(sizeof(*dd->ipath_lastegrheads)
+					 * dd->ipath_cfgports, GFP_KERNEL);
+	dd->ipath_lastrcvhdrqtails = kmalloc(sizeof(*dd->ipath_lastrcvhdrqtails)
+					     * dd->ipath_cfgports, GFP_KERNEL);
+	if (!dd->ipath_lastegrheads || !dd->ipath_lastrcvhdrqtails) {
+		_IPATH_UNIT_ERROR(t,
+				  "Unable to allocate head arrays, failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	memset(dd->ipath_lastrcvhdrqtails, 0,
+	       sizeof(*dd->ipath_lastrcvhdrqtails)
+	       * dd->ipath_cfgports);
+	memset(dd->ipath_lastegrheads, 0, sizeof(*dd->ipath_lastegrheads)
+	       * dd->ipath_cfgports);
+
+	dd->ipath_pd[0] = kmalloc(sizeof(ipath_portdata), GFP_KERNEL);
+	if (!dd->ipath_pd[0]) {
+		_IPATH_UNIT_ERROR(t,
+				  "Unable to allocate portdata for port 0, failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	memset(dd->ipath_pd[0], 0, sizeof(ipath_portdata));
+
+	pd = dd->ipath_pd[0];
+	pd->port_unit = t;
+	pd->port_port = 0;
+	pd->port_cnt = 1;
+	/* The port 0 pkey table is used by the layer interface. */
+	pd->port_pkeys[0] = IPS_DEFAULT_P_KEY;
+
+	dd->ipath_rcvtidcnt = ipath_kget_kreg32(t, kr_rcvtidcnt);
+	dd->ipath_rcvtidbase = ipath_kget_kreg32(t, kr_rcvtidbase);
+	dd->ipath_rcvegrcnt = ipath_kget_kreg32(t, kr_rcvegrcnt);
+	dd->ipath_rcvegrbase = ipath_kget_kreg32(t, kr_rcvegrbase);
+	dd->ipath_palign = ipath_kget_kreg32(t, kr_pagealign);
+	dd->ipath_piobufbase = ipath_kget_kreg32(t, kr_sendpiobufbase);
+	dd->ipath_piosize = ipath_kget_kreg32(t, kr_sendpiosize);
+	dd->ipath_ibmtu = 4096;	/* default to largest legal MTU */
+	dd->ipath_piobcnt = ipath_kget_kreg32(t, kr_sendpiobufcnt);
+
+	_IPATH_VDBG
+	    ("Revision %llx (PCI %x), %u ports, %u tids, %u egrtids, %u piobufs\n",
+	     dd->ipath_revision, dd->ipath_pcirev, dd->ipath_portcnt,
+	     dd->ipath_rcvtidcnt, dd->ipath_rcvegrcnt, dd->ipath_piobcnt);
+
+	if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
+	     INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
+		/* >= maybe, someday */
+		_IPATH_UNIT_ERROR(t,
+				  "Driver only handles version %d, chip swversion is %d (%llx), failing\n",
+				  IPATH_CHIP_SWVERSION,
+				  (int)(dd->ipath_revision >>
+					INFINIPATH_R_SOFTWARE_SHIFT) &
+				  INFINIPATH_R_SOFTWARE_MASK,
+				  dd->ipath_revision);
+		ret = -ENOSYS;
+		goto done;
+	}
+	dd->ipath_majrev = (uint8_t) ((dd->ipath_revision >>
+				       INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
+				      INFINIPATH_R_CHIPREVMAJOR_MASK);
+	dd->ipath_minrev =
+	    (uint8_t) ((dd->ipath_revision >> INFINIPATH_R_CHIPREVMINOR_SHIFT) &
+		       INFINIPATH_R_CHIPREVMINOR_MASK);
+	dd->ipath_boardrev =
+	    (uint8_t) ((dd->ipath_revision >> INFINIPATH_R_BOARDID_SHIFT) &
+		       INFINIPATH_R_BOARDID_MASK);
+
+	ipath_get_boardname(t, boardn, sizeof boardn);
+
+	snprintf(chip_driver_version, chip_driver_size,
+		 "Driver %u.%u, %s, InfiniPath%u %u.%u, PCI %u, SW Compat %u\n",
+		 IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
+		 (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
+		 INFINIPATH_R_ARCH_MASK, dd->ipath_majrev,
+		 dd->ipath_minrev, dd->ipath_pcirev,
+		 (unsigned)(dd->ipath_revision >>
+			    INFINIPATH_R_SOFTWARE_SHIFT) &
+		 INFINIPATH_R_SOFTWARE_MASK);
+
+	_IPATH_DBG("%s", chip_driver_version);
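+
+	/*
+	 * [Editor's illustrative aside -- not part of the original patch]
+	 * all the revision fields above are unpacked with the same
+	 * shift-and-mask idiom; as a sketch (hypothetical helper):
+	 *
+	 *	static unsigned example_rev_field(uint64_t rev,
+	 *					  unsigned shift,
+	 *					  unsigned mask)
+	 *	{
+	 *		return (unsigned)((rev >> shift) & mask);
+	 *	}
+	 */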
+
+	/*
+	 * we ignore most issues after reporting them, but have to specially
+	 * handle hardware-disabled chips.
+	 */
+	if (ipath_validate_rev(dd) == 2) {
+		/* unique error, known to infinipath_init_one() */
+		ret = -EPERM;
+		goto done;
+	}
+
+	/*
+	 * zero all the TID entries at startup.  We do this for sanity,
+	 * in case of a previous driver crash of some kind, and also
+	 * because the chip powers up with these memories in an unknown
+	 * state.  Use portcnt, not cfgports, since this is for the full
+	 * chip, not for current (possibly different) configuration value
+	 * Chip Errata bug 6447
+	 */
+	for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
+		ipath_clear_tids(t, val32);
+
+	dd->ipath_rcvhdrentsize = IPATH_RCVHDRENTSIZE;
+	/*
+	 * we could bump this to allow for full rcvegrcnt + rcvtidcnt, but
+	 * then it no longer nicely fits power of two, and since we now use
+	 * alloc_pages, the rest would be wasted.
+	 */
+	dd->ipath_rcvhdrcnt = dd->ipath_rcvegrcnt;
+	/*
+	 * setup offset of last valid entry in rcvhdrq, for various tests, to
+	 * avoid calculating each time we need it
+	 */
+	dd->ipath_hdrqlast =
+	    dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
+	ipath_kput_kreg(t, kr_rcvhdrentsize, dd->ipath_rcvhdrentsize);
+	ipath_kput_kreg(t, kr_rcvhdrcnt, dd->ipath_rcvhdrcnt);
+	/*
+	 * not in ipath_rcvhdrsize, so user programs can set differently, but
+	 * so any early packets see the default size.
+	 */
+	ipath_kput_kreg(t, kr_rcvhdrsize, IPATH_DFLT_RCVHDRSIZE);
+
+	/*
+	 * we "know" that this works out OK.  It's actually a bit more than
+	 * we need, but 2048+64 isn't quite enough for full size, and we
+	 * want the +N to be a power of 2 to give us reasonable alignment
+	 * and fit within page_alloc()'ed memory
+	 */
+	dd->ipath_rcvegrbufsize = dd->ipath_piosize;
+
+	/*
+	 * the min() check here is currently a nop, but it may not always be,
+	 * depending on just how we do ipath_rcvegrbufsize
+	 */
+	dd->ipath_ibmaxlen = min(dd->ipath_piosize, dd->ipath_rcvegrbufsize);
+	dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
+
+	/*
+	 * set up the shadow copies of the piobufavail registers, which
+	 * we compare against the chip registers for now, and the in
+	 * memory DMA'ed copies of the registers.  This has to be done
+	 * early, before we calculate lastport, etc.
+	 */
+	val = dd->ipath_piobcnt;
+	/*
+	 * calc number of pioavail registers, and save it; we have 2 bits
+	 * per buffer
+	 */
+	dd->ipath_pioavregs =
+	    round_up(val, sizeof(uint64_t) * _BITS_PER_BYTE / 2) /
+	    (sizeof(uint64_t) * _BITS_PER_BYTE / 2);
+	if (dd->ipath_pioavregs >
+	    (sizeof(dd->ipath_pioavailshadow) /
+	     sizeof(dd->ipath_pioavailshadow[0]))) {
+		dd->ipath_pioavregs =
+		    sizeof(dd->ipath_pioavailshadow) /
+		    sizeof(dd->ipath_pioavailshadow[0]);
+		/* 2 bits/reg */
+		dd->ipath_piobcnt =
+		    dd->ipath_pioavregs * sizeof(uint64_t) * _BITS_PER_BYTE >> 1;
+		_IPATH_INFO
+		    ("Warning: %lld piobufs is too many to fit in shadow, only using %d\n",
+		     val, dd->ipath_piobcnt);
+	}
+
+	if (!infinipath_kpiobufs) {
+		/* have to have at least one, for SMA */
+		kpiobufs = infinipath_kpiobufs = 1;
+	} else if (dd->ipath_piobcnt <
+		   (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT)) {
+		_IPATH_INFO
+		    ("Too few PIO buffers (%u) for %u ports to have %u each!\n",
+		     dd->ipath_piobcnt, dd->ipath_cfgports,
+		     IPATH_MIN_USER_PORT_BUFCNT);
+		kpiobufs = 1;	/* reserve just the minimum for SMA/ether */
+	} else
+		kpiobufs = infinipath_kpiobufs;
+
+	if (kpiobufs >
+	    (dd->ipath_piobcnt -
+	     (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT))) {
+		i = dd->ipath_piobcnt -
+		    (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT);
+		if (i < 0)
+			i = 0;
+		_IPATH_INFO
+		    ("Allocating %d PIO bufs for kernel leaves too few for %d user ports (%d each); using %u\n",
+		     kpiobufs, dd->ipath_cfgports - 1,
+		     IPATH_MIN_USER_PORT_BUFCNT, i);
+		/*
+		 * shouldn't change infinipath_kpiobufs, because could be
+		 * different for different devices...
+		 */
+		kpiobufs = i;
+	}
+	dd->ipath_lastport_piobuf = dd->ipath_piobcnt - kpiobufs;
+	dd->ipath_pbufsport = dd->ipath_cfgports > 1 ?
+	    dd->ipath_lastport_piobuf / (dd->ipath_cfgports - 1) : 0;
+	val32 = dd->ipath_lastport_piobuf -
+	    (dd->ipath_pbufsport * (dd->ipath_cfgports - 1));
+	if (val32 > 0) {
+		_IPATH_DBG
+		    ("allocating %u pbufs/port leaves %u unused, add to kernel\n",
+		     dd->ipath_pbufsport, val32);
+		dd->ipath_lastport_piobuf -= val32;
+	}
+	dd->ipath_lastpioindex = dd->ipath_lastport_piobuf;
+	_IPATH_VDBG
+	    ("%d PIO bufs %u - %u, %u each for %u user ports\n",
+	     kpiobufs, dd->ipath_lastport_piobuf, dd->ipath_piobcnt,
+	     dd->ipath_pbufsport, dd->ipath_cfgports - 1);
+
+	/*
+	 * this has to be page aligned, and on a page of its own, so we
+	 * can map it into user space.  We also use it to give processes
+	 * a copy of ipath_statusp, on a separate cacheline, followed by
+	 * a copy of the freeze error string, if it's happened.
+	 * Might also use that space for other things.
+	 */
+	val = round_up(2 * L1_CACHE_BYTES + sizeof(*dd->ipath_statusp) +
+		       dd->ipath_pioavregs * sizeof(uint64_t), 2 * PAGE_SIZE);
+	if (!(dd->ipath_pioavailregs_dma = kmalloc(val * sizeof(uint64_t),
+						   GFP_KERNEL))) {
+		_IPATH_UNIT_ERROR(t,
+				  "failed to allocate PIOavail reg area in memory\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	if ((PAGE_SIZE - 1) & (uint64_t) dd->ipath_pioavailregs_dma) {
+		dd->__ipath_pioavailregs_base = dd->ipath_pioavailregs_dma;
+		dd->ipath_pioavailregs_dma = (uint64_t *)
+		    round_up((uint64_t) dd->ipath_pioavailregs_dma, PAGE_SIZE);
+	} else
+		dd->__ipath_pioavailregs_base = dd->ipath_pioavailregs_dma;
+	/*
+	 * zero initial, since whole thing mapped
+	 * into user space, and don't want info leak, or confusing garbage
+	 */
+	memset((void *)dd->ipath_pioavailregs_dma, 0, PAGE_SIZE);
+
+	/*
+	 * we really want L2 cache aligned, but for current CPUs of interest,
+	 * they are the same.
+	 */
+	dd->ipath_statusp = (uint64_t *) ((char *)dd->ipath_pioavailregs_dma +
+					  ((2 * L1_CACHE_BYTES +
+					    dd->ipath_pioavregs *
+					    sizeof(uint64_t)) &
+					   ~(L1_CACHE_BYTES - 1)));
+	/* copy the current value now that it's really allocated */
+	*dd->ipath_statusp = dd->_ipath_status;
+	/*
+	 * setup buffer to hold freeze msg, accessible to apps, following
+	 * statusp
+	 */
+	dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
+	/* and its length */
+	dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
+
+	atmp = virt_to_phys(dd->ipath_pioavailregs_dma);
+	/* stash physical address for user progs */
+	dd->ipath_pioavailregs_phys = atmp;
+	(void)ipath_kput_kreg(t, kr_sendpioavailaddr, atmp);
+	/*
+	 * this is to detect s/w errors, which the h/w works around by
+	 * ignoring the low 6 bits of address, if it wasn't aligned.
+	 */
+	val = ipath_kget_kreg64(t, kr_sendpioavailaddr);
+	if (val != atmp) {
+		_IPATH_UNIT_ERROR(t,
+				  "Catastrophic software error, SendPIOAvailAddr written as %llx, read back as %llx\n",
+				  atmp, val);
+		ret = -EINVAL;
+		goto done;
+	}
+
+	if (t * 64 > (sizeof(ipath_port0_rcvhdrtail) - 64)) {
+		_IPATH_UNIT_ERROR(t,
+				  "unit %u too large for port 0 rcvhdrtail buffer size\n",
+				  t);
+		ret = -ENODEV;
+	}
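+
+	/*
+	 * [Editor's illustrative aside -- not part of the original patch]
+	 * the statusp placement above uses the usual align-down idiom:
+	 * for a power-of-two size, the mask must be (size - 1), e.g.
+	 *
+	 *	aligned = x & ~((uint64_t) (L1_CACHE_BYTES - 1));
+	 */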
+
+	/*
+	 * kernel modules are loaded into vmalloc'ed memory; verify that
+	 * when we assume that, map to phys, and back to virt, we get the
+	 * right contents, so we did the mapping right.
+	 */
+	vpage = vmalloc_to_page((void *)ipath_port0_rcvhdrtail);
+	if (vpage == NOPAGE_SIGBUS || vpage == NOPAGE_OOM) {
+		_IPATH_UNIT_ERROR(t, "vmalloc_to_page for rcvhdrtail fails!\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * 64 is driven by cache line size, and also by chip requirement
+	 * that low 6 bits be 0
+	 */
+	val = page_to_phys(vpage) + t * 64;
+
+	/* verify that the alignment requirement was met */
+	ipath_kput_kreg_port(t, kr_rcvhdrtailaddr, 0, val);
+	atmp = ipath_kget_kreg64_port(t, kr_rcvhdrtailaddr, 0);
+	if (val != atmp) {
+		_IPATH_UNIT_ERROR(t,
+				  "Catastrophic software error, RcvHdrTailAddr0 written as %llx, read back as %llx from %x\n",
+				  val, atmp, kr_rcvhdrtailaddr);
+		ret = -EINVAL;
+		goto done;
+	}
+	/* so we can get current tail in ipath_kreceive(), per chip */
+	dd->ipath_hdrqtailptr =
+	    &ipath_port0_rcvhdrtail[t *
+				    (64 / sizeof(ipath_port0_rcvhdrtail[0]))];
+
+	ipath_kput_kreg(t, kr_rcvbthqp, IPATH_KD_QP);
+
+	/*
+	 * make sure we are not in freeze, and PIO send enabled, so
+	 * writes to pbc happen
+	 */
+	ipath_kput_kreg(t, kr_hwerrmask, 0ULL);
+	ipath_kput_kreg(t, kr_hwerrclear, ~0ULL);
+	ipath_kput_kreg(t, kr_control, 0ULL);
+	ipath_kput_kreg(t, kr_sendctrl, INFINIPATH_S_PIOENABLE);
+
+	/*
+	 * write the pbc of each buffer, to be sure it's initialized, then
+	 * cancel all the buffers, and also abort any packets that might
+	 * have been in flight for some reason (the latter is for driver
+	 * unload/reload, but isn't a bad idea at first init).
+	 * PIO send isn't enabled at this point, so there is no danger
+	 * of sending these out on the wire.
+	 * Chip Errata bug 6610
+	 */
+	piobuf = (uint32_t *) (((char *)(dd->ipath_kregbase)) +
+			       dd->ipath_piobufbase);
+	pioincr = devdata[t].ipath_palign / sizeof(*piobuf);
+	for (i = 0; i < dd->ipath_piobcnt; i++) {
+		*piobuf = 16;	/* reasonable word count, just to init pbc */
+		piobuf += pioincr;
+	}
+	/* self-clearing */
+	ipath_kput_kreg(t, kr_sendctrl, INFINIPATH_S_ABORT);
+
+	/*
+	 * before error clears, since we expect serdes pll errors during
+	 * this, the first time after reset
+	 */
+	if (ipath_bringup_link(t)) {
+		_IPATH_INFO("Failed to bringup IB link\n");
+		ret = -ENETDOWN;
+		goto done;
+	}
+
+	/*
+	 * clear any "expected" hwerrs from reset and/or initialization
+	 * clear any that aren't enabled (at least this once), and then
+	 * set the enable mask
+	 */
+	ipath_clear_init_hwerrs(t);
+	ipath_kput_kreg(t, kr_hwerrclear, ~0ULL);
+	ipath_kput_kreg(t, kr_hwerrmask, dd->ipath_hwerrmask);
+
+	dd->ipath_maskederrs = dd->ipath_ignorederrs;
+	/* clear all */
+	ipath_kput_kreg(t, kr_errorclear, ~0ULL);
+	/* enable errors that are masked, at least this first time. */
+	ipath_kput_kreg(t, kr_errormask, ~dd->ipath_maskederrs);
+	/* clear any interrupts up to this point (ints still not enabled) */
+	ipath_kput_kreg(t, kr_intclear, ~0ULL);
+
+	ipath_stats.sps_lid[t] = dd->ipath_lid;
+
+	/*
+	 * allocate the shadow TID array, so we can ipath_munlock
+	 * previous entries.
+	 * It may make more sense to move the pageshadow to the port
+	 * data structure, so we only allocate memory for ports actually
+	 * in use, since we're at 8k per port now.
+	 */
+	dd->ipath_pageshadow = (struct page **)
+	    vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+		    sizeof(struct page *));
+	if (!dd->ipath_pageshadow)
+		_IPATH_UNIT_ERROR(t,
+				  "failed to allocate shadow page * array, no expected sends!\n");
+	else
+		memset(dd->ipath_pageshadow, 0,
+		       dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+		       sizeof(struct page *));
+
+	/* set up the port 0 (kernel) rcvhdr q and egr TIDs */
+	if (!(ret = ipath_create_rcvhdrq(dd->ipath_pd[0])))
+		ret = ipath_create_port0_egr(dd->ipath_pd[0]);
+	if (ret)
+		_IPATH_UNIT_ERROR(t,
+				  "failed to allocate port 0 (kernel) rcvhdrq and/or egr bufs\n");
+	else {
+		init_waitqueue_head(&ipath_sma_wait);
+		init_waitqueue_head(&ipath_sma_state_wait);
+
+		ipath_kput_kreg(pd->port_unit, kr_rcvctrl, dd->ipath_rcvctrl);
+
+		ipath_kput_kreg(t, kr_rcvbthqp, IPATH_KD_QP);
+
+		/* Enable PIO send, and update of PIOavail regs to memory. */
+		dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE
+		    | INFINIPATH_S_PIOBUFAVAILUPD;
+		ipath_kput_kreg(t, kr_sendctrl, dd->ipath_sendctrl);
+
+		/*
+		 * enable port 0 receive, and receive interrupt
+		 * other ports done as user opens and inits them
+		 */
+		dd->ipath_rcvctrl = INFINIPATH_R_TAILUPD |
+		    (1ULL << INFINIPATH_R_PORTENABLE_SHIFT) |
+		    (1ULL << INFINIPATH_R_INTRAVAIL_SHIFT);
+		ipath_kput_kreg(t, kr_rcvctrl, dd->ipath_rcvctrl);
+
+		/*
+		 * now ready for use
+		 * this should be cleared whenever we detect a reset, or
+		 * initiate one.
+		 */
+		dd->ipath_flags |= IPATH_INITTED;
+
+		/*
+		 * init our shadow copies of head from tail values, and write
+		 * head values to match
+		 */
+		val32 = ipath_kget_ureg32(t, ur_rcvegrindextail, 0);
+		(void)ipath_kput_ureg(t, ur_rcvegrindexhead, val32, 0);
+		dd->ipath_port0head = ipath_kget_ureg32(t, ur_rcvhdrtail, 0);
+		(void)ipath_kput_ureg(t, ur_rcvhdrhead, dd->ipath_port0head, 0);
+
+		/*
+		 * by now pioavail updates to memory should have occurred,
+		 * so copy them into our working/shadow registers; this is
+		 * in case something went wrong with abort, but mostly to
+		 * get the initial values of the generation bit correct
+		 */
+		for (i = 0; i < dd->ipath_pioavregs; i++) {
+			/*
+			 * Chip Errata bug 6641; even and odd qwords>3
+			 * are swapped
+			 */
+			if (i > 3) {
+				if (i & 1)
+					dd->ipath_pioavailshadow[i] =
+					    dd->ipath_pioavailregs_dma[i - 1];
+				else
+					dd->ipath_pioavailshadow[i] =
+					    dd->ipath_pioavailregs_dma[i + 1];
+			} else
+				dd->ipath_pioavailshadow[i] =
+				    dd->ipath_pioavailregs_dma[i];
+		}
+		/* can get counters, stats, etc. */
+		dd->ipath_flags |= IPATH_PRESENT;
+	}
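+
+	/*
+	 * [Editor's illustrative aside -- not part of the original patch]
+	 * the errata 6641 workaround above maps DMA-copy indices to
+	 * shadow indices; as a sketch (hypothetical helper):
+	 *
+	 *	static unsigned example_errata6641_index(unsigned i)
+	 *	{
+	 *		if (i > 3)
+	 *			return (i & 1) ? i - 1 : i + 1;
+	 *		return i;
+	 *	}
+	 */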
+
+/*
+ * like ipath_waitfor_complete(), but we wait for the CMDVALID bit to go
+ * away, indicating the last command has completed.  It doesn't return data
+ */
+int ipath_waitfor_mdio_cmdready(const ipath_type t)
+{
+	uint64_t timeout;
+	uint64_t val;
+
+	timeout = get_cycles() + 0x10000000ULL;	/* <- ridiculously long time */
+	do {
+		val = ipath_kget_kreg64(t, kr_mdio);
+		if (!(val & IPATH_MDIO_CMDVALID))
+			return 0;
+		yield();
+		if (get_cycles() > timeout) {
+			_IPATH_DBG("CMDVALID stuck in mdio reg? (%llx)\n", val);
+			return ENODEV;
+		}
+	} while (1);
+}
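/*
 * The two routines above share one pattern: poll a register until some
 * bits appear (or disappear), yielding between reads, and give up after
 * a deadline.  A minimal userspace sketch of that pattern, assuming a
 * hypothetical read_reg() callback and a 2-second budget in place of the
 * driver's get_cycles() arithmetic:
 */
#include <stdint.h>
#include <time.h>
#include <sched.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* poll until all bits in 'want' are set, yielding between reads */
int wait_for_bits(uint64_t (*read_reg)(void), uint64_t want, uint64_t *valp)
{
	uint64_t deadline = now_ns() + 2000000000ull;	/* ~2 s budget */

	do {
		*valp = read_reg();	/* caller sees the last value even on failure */
		if ((*valp & want) == want)
			return 0;
		sched_yield();	/* don't monopolize the CPU while we wait */
	} while (now_ns() < deadline);
	return -1;	/* timed out */
}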
+
+void ipath_set_ib_lstate(const ipath_type t, int which)
+{
+	ipath_devdata *dd = &devdata[t];
+	char *what;
+
+	/*
+	 * For all cases, we'll either be setting a new value of linkcmd,
+	 * or we want it to be NOP, so clear it here.
+	 * Similarly, we want the linkinitcmd to be NOP for everything
+	 * other than explicitly changing linkinitcmd, and for that case,
+	 * we want to first clear any existing bits
+	 */
+	dd->ipath_ibcctrl &= ~((INFINIPATH_IBCC_LINKCMD_MASK <<
+				INFINIPATH_IBCC_LINKCMD_SHIFT) |
+			       (INFINIPATH_IBCC_LINKINITCMD_MASK <<
+				INFINIPATH_IBCC_LINKINITCMD_SHIFT));
+
+	if (which == INFINIPATH_IBCC_LINKCMD_INIT) {
+		dd->ipath_flags &= ~(IPATH_LINK_TOARMED | IPATH_LINK_TOACTIVE
+				     | IPATH_LINK_SLEEPING);
+		/* so we can watch for a transition */
+		dd->ipath_flags |= IPATH_LINKDOWN;
+		what = "INIT";
+	} else if (which == INFINIPATH_IBCC_LINKCMD_ARMED) {
+		dd->ipath_flags |= IPATH_LINK_TOARMED;
+		dd->ipath_flags &= ~(IPATH_LINK_TOACTIVE | IPATH_LINK_SLEEPING);
+		/*
+		 * this is mainly for loopback testing.  If INITCMD is
+		 * NOP or SLEEP, the link won't ever come up in loopback...
+		 */
+		if (!(dd->ipath_flags & (IPATH_LINKINIT | IPATH_LINKARMED |
+					 IPATH_LINKACTIVE))) {
+			_IPATH_SMADBG("going to armed, but link not yet up, set POLL\n");
+			dd->ipath_ibcctrl |=
+				INFINIPATH_IBCC_LINKINITCMD_POLL <<
+				INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+		}
+		what = "ARMED";
+	} else if (which == INFINIPATH_IBCC_LINKCMD_ACTIVE) {
+		dd->ipath_flags |= IPATH_LINK_TOACTIVE;
+		dd->ipath_flags &= ~(IPATH_LINK_TOARMED | IPATH_LINK_SLEEPING);
+		what = "ACTIVE";
+	} else if (which & (INFINIPATH_IBCC_LINKINITCMD_MASK <<
+			    INFINIPATH_IBCC_LINKINITCMD_SHIFT)) {
+		/* down, disable, etc. */
+		dd->ipath_flags &= ~(IPATH_LINK_TOARMED | IPATH_LINK_TOACTIVE);
+		if (((which & INFINIPATH_IBCC_LINKINITCMD_MASK) >>
+		     INFINIPATH_IBCC_LINKINITCMD_SHIFT) ==
+		    INFINIPATH_IBCC_LINKINITCMD_SLEEP) {
+			dd->ipath_flags |= IPATH_LINK_SLEEPING | IPATH_LINKDOWN;
+		} else
+			dd->ipath_flags |= IPATH_LINKDOWN;
+		dd->ipath_ibcctrl |=
+			which & (INFINIPATH_IBCC_LINKINITCMD_MASK <<
+				 INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+		what = "DOWN";
+	} else {
+		what = "UNKNOWN";
+		_IPATH_INFO("Unknown link transition requested (which=0x%x)\n",
+			    which);
+	}
+
+	dd->ipath_ibcctrl |= ((uint64_t) which & INFINIPATH_IBCC_LINKCMD_MASK)
+		<< INFINIPATH_IBCC_LINKCMD_SHIFT;
+
+	_IPATH_SMADBG("Trying to move unit %u to %s, current ltstate is %s\n",
+		      t, what,
+		      ipath_ibcstatus_str[(ipath_kget_kreg64(t, kr_ibcstatus)
+			      >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)
+			      & INFINIPATH_IBCS_LINKTRAININGSTATE_MASK]);
+	ipath_kput_kreg(t, kr_ibcctrl, dd->ipath_ibcctrl);
+}
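/*
 * The linkcmd/linkinitcmd handling above is a mask-and-shift
 * read-modify-write on a shadow copy of the IBC control register: clear
 * the field, then OR in the new value.  A generic sketch of that idiom;
 * the mask, shift, and value in the usage comment are hypothetical, not
 * the chip's actual field layout.
 */
#include <stdint.h>

/* clear the field selected by (mask << shift), then insert 'val' into it */
static inline uint64_t field_set(uint64_t reg, uint64_t mask,
				 unsigned int shift, uint64_t val)
{
	reg &= ~(mask << shift);
	reg |= (val & mask) << shift;
	return reg;
}

/* e.g. set a hypothetical 2-bit LINKCMD field at bit 16 to 2 (ARMED): */
/*	ibcctrl = field_set(ibcctrl, 0x3, 16, 2);                       */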
+
+static int ipath_bringup_link(const ipath_type t)
+{
+	ipath_devdata *dd = &devdata[t];
+	uint64_t val, ibc;
+	int ret = 0;
+
+	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;	/* hold IBC in reset */
+	ipath_kput_kreg(t, kr_control, dd->ipath_control);
+
+	/*
+	 * Note that prior to try 14 or 15 of IB, the credit scaling
+	 * wasn't working, because it was swapped for writes with the
+	 * 1 bit default linkstate field
+	 */
+
+	/* ignore pbc and align word */
+	val = dd->ipath_piosize - 2 * sizeof(uint32_t);
+	/*
+	 * for ICRC, which we only send in diag test pkt mode, and we don't
+	 * need to worry about that for mtu
+	 */
+	val += 1;
+	/*
+	 * set the IBC maxpktlength to the size of our pio buffers;
+	 * the maxpktlength is in words.  This is *not* the IB data MTU
+	 */
+	ibc = (val / sizeof(uint32_t)) << INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+	/* in KB */
+	ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
+	/*
+	 * how often flowctrl is sent, more or less in usecs; balance
+	 * against the watermark value, so that in theory senders always
+	 * get a flow control update in time to not let the IB link go idle.
+	 */
+	ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
+	/* max error tolerance */
+	ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+	/* use "real" buffer space for IB credit flow control */
+	ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
+	ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+	/* initially come up waiting for TS1, without sending anything. */
+	dd->ipath_ibcctrl = ibc;
+	/* don't put linkinitcmd in ipath_ibcctrl, want that to stay a NOP */
+	ibc |= INFINIPATH_IBCC_LINKINITCMD_SLEEP <<
+		INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+	dd->ipath_flags |= IPATH_LINK_SLEEPING;
+	ipath_kput_kreg(t, kr_ibcctrl, ibc);
+
+	ret = ipath_bringup_serdes(t);
+
+	if (ret)
+		_IPATH_INFO("Could not initialize SerDes, not usable\n");
+	else {
+		dd->ipath_control |= INFINIPATH_C_LINKENABLE;	/* enable IBC */
+		ipath_kput_kreg(t, kr_control, dd->ipath_control);
+	}
+
+	return ret;
+}
+
+/*
+ * called from ipath_shutdown_link(), and from sma doing a LINKDOWN.
+ * Left as a separate function for historical reasons, and we may want
+ * it to do more than just call ipath_set_ib_lstate() again sometime
+ * in the future.
+ */
+void ipath_down_link(const ipath_type t)
+{
+	ipath_set_ib_lstate(t, INFINIPATH_IBCC_LINKINITCMD_SLEEP <<
+			    INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+}
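/*
 * A worked example of the MaxPktLen arithmetic in ipath_bringup_link():
 * drop the two leading 32-bit words (pbc and align word), add one byte of
 * headroom for ICRC, then convert bytes to 32-bit words.  The 2048-byte
 * PIO buffer size below is a hypothetical value, just to make the
 * arithmetic concrete.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t piosize = 2048;	/* hypothetical PIO buffer size, bytes */
	uint64_t val = piosize - 2 * sizeof(uint32_t);	/* strip pbc + align */

	val += 1;	/* room for ICRC (diag test packet mode only) */
	/* prints 510 for a 2048-byte buffer: (2048 - 8 + 1) / 4 */
	printf("IBC MaxPktLen = %llu words\n",
	       (unsigned long long)(val / sizeof(uint32_t)));
	return 0;
}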
+
+/*
+ * do this when driver is being unloaded, or perhaps for diags, and
+ * maybe when we get an interrupt of a fatal link error that requires
+ * bringing the link down and back up
+ */
+static int ipath_shutdown_link(const ipath_type t)
+{
+	uint64_t val;
+	ipath_devdata *dd = &devdata[t];
+	int ret = 0;
+
+	_IPATH_DBG("Shutting down the link\n");
+	ipath_down_link(t);
+
+	/*
+	 * we are shutting down, so tell the layered driver.  We don't
+	 * do this on just a link state change, much like ethernet:
+	 * a cable unplug, etc. doesn't change driver state
+	 */
+	if (dd->ipath_layer.l_intr)
+		dd->ipath_layer.l_intr(t, IPATH_LAYER_INT_IF_DOWN);
+
+	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;	/* disable IBC */
+	ipath_kput_kreg(t, kr_control, dd->ipath_control);
+
+	*dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF | IPATH_STATUS_IB_READY);
+
+	/*
+	 * clear SerdesEnable and turn the leds off; do this here because
+	 * we are unloading, so don't count on interrupts to move along
+	 */
+	ipath_quiet_serdes(t);
+	val = dd->ipath_extctrl &
+		~(INFINIPATH_EXTC_LEDPRIPORTGREENON |
+		  INFINIPATH_EXTC_LEDPRIPORTYELLOWON);
+	dd->ipath_extctrl = val;
+	ipath_kput_kreg(t, kr_extctrl, val);
+
+	if (dd->ipath_stats_timer_active) {
+		del_timer_sync(&dd->ipath_stats_timer);
+		dd->ipath_stats_timer_active = 0;
+	}
+	if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
+		/* can't do anything more with chip; needs re-init */
+		*dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
+		if (dd->ipath_kregbase) {
+			/*
+			 * if we haven't already cleaned up before, these
+			 * ensure that any register reads/writes "fail"
+			 * until re-init
+			 */
+			dd->ipath_kregbase = NULL;
+			dd->ipath_kregvirt = NULL;
+			dd->ipath_uregbase = 0ULL;
+			dd->ipath_sregbase = 0ULL;
+			dd->ipath_cregbase = 0ULL;
+			dd->ipath_kregsize = 0;
+		}
+#ifdef CONFIG_MTRR
+		if (dd->ipath_mtrr) {
+			_IPATH_VDBG("undoing WCCOMB on pio buffers\n");
+			mtrr_del(dd->ipath_mtrr, 0, 0);
+			dd->ipath_mtrr = 0;
+		}
+#endif
+	}
+
+	return ret;
+}
+
+/*
+ * when closing, free up any allocated data for a port, if the
+ * reference count goes to zero.
+ * Note: this also frees the portdata itself!
+ */
+void ipath_free_pddata(ipath_devdata *dd, uint32_t port, int freehdrq)
+{
+	ipath_portdata *pd = dd->ipath_pd[port];
+
+	if (!pd)
+		return;
+	if (freehdrq)
+		/*
+		 * only clear and free portdata if we are going to also
+		 * release the hdrq, otherwise we leak the hdrq on each
+		 * open/close cycle
+		 */
+		dd->ipath_pd[port] = NULL;
+	/* cleanup locked pages private data structures */
+	ipath_mlock_cleanup(pd);
+	if (freehdrq && pd->port_rcvhdrq) {
+		int i, n = 1 << pd->port_rcvhdrq_order;
+		_IPATH_VDBG("free closed port %d rcvhdrq @ %p (order=%u)\n",
+			    pd->port_port, pd->port_rcvhdrq,
+			    pd->port_rcvhdrq_order);
+		for (i = 0; i < n; i++)
+			ClearPageReserved(virt_to_page(pd->port_rcvhdrq +
+						       (i * PAGE_SIZE)));
+		free_pages((unsigned long)pd->port_rcvhdrq,
+			   pd->port_rcvhdrq_order);
+		pd->port_rcvhdrq = NULL;
+	}
+	if (port && pd->port_rcvegrbuf_pages) {	/* always free this, however */
+		void *virt;
+		unsigned e, i, n = 1 << pd->port_rcvegrbuf_order;
+		if (pd->port_rcvegrbuf_virt) {
+			for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+				virt = pd->port_rcvegrbuf_virt[e];
+				for (i = 0; i < n; i++)
+					ClearPageReserved(virt_to_page(virt +
+							(i * PAGE_SIZE)));
+				_IPATH_VDBG("egrbuf free_pages(%p, %x), chunk %u/%u\n",
+					    virt, pd->port_rcvegrbuf_order, e,
+					    pd->port_rcvegrbuf_chunks);
+				free_pages((unsigned long)virt,
+					   pd->port_rcvegrbuf_order);
+			}
+			vfree(pd->port_rcvegrbuf_virt);
+			pd->port_rcvegrbuf_virt = NULL;
+		}
+		pd->port_rcvegrbuf_chunks = 0;
+		_IPATH_VDBG("free closed port %d rcvegrbufs ptr array\n",
+			    pd->port_port);
+		/* now the pointer array. */
+		vfree(pd->port_rcvegrbuf_pages);
+		pd->port_rcvegrbuf_pages = NULL;
+	} else if (port == 0 && dd->ipath_port0_skbs) {
+		unsigned e;
+		struct sk_buff **skbs = dd->ipath_port0_skbs;
+
+		dd->ipath_port0_skbs = NULL;
+		_IPATH_VDBG("free closed port %d ipath_port0_skbs @ %p\n",
+			    pd->port_port, skbs);
+		for (e = 0; e < dd->ipath_rcvegrcnt; e++)
+			if (skbs[e])
+				dev_kfree_skb(skbs[e]);
+		vfree(skbs);
+	}
+	if (freehdrq) {
+		kfree(pd->port_tid_pg_list);
+		kfree(pd);
+	}
+}
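/*
 * The egr-buffer teardown above walks a table of chunk pointers, frees
 * each chunk, then frees the table itself and zeroes the bookkeeping.  A
 * minimal userspace analog of that ordering, using malloc/free; the
 * chunk_table type and its field names are hypothetical.
 */
#include <stdlib.h>

struct chunk_table {
	void **chunks;		/* one allocation per chunk */
	unsigned int nchunks;
};

void chunk_table_free(struct chunk_table *t)
{
	unsigned int e;

	if (!t->chunks)
		return;
	for (e = 0; e < t->nchunks; e++)
		free(t->chunks[e]);	/* free(NULL) is a harmless no-op */
	free(t->chunks);	/* now the pointer array itself */
	t->chunks = NULL;	/* so a double call is safe */
	t->nchunks = 0;
}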
+
+int __init infinipath_init(void)
+{
+	int r = 0, i;
+
+	_IPATH_DBG(KERN_INFO DRIVER_LOAD_MSG "%s", ipath_core_version);
+
+	ipath_init_picotime();	/* init cycles -> pico conversion */
+
+	if (!ipath_ctl_header) {	/* should always be true */
+		if (!(ipath_ctl_header = register_sysctl_table(ipath_ctl, 1)))
+			_IPATH_INFO("Couldn't register sysctl interface\n");
+	}
+
+	/*
+	 * initialize the statusp to temporary storage so we can use it
+	 * everywhere without first checking.  When we "really" assign it,
+	 * we copy from _ipath_status
+	 */
+	for (i = 0; i < infinipath_max; i++)
+		devdata[i].ipath_statusp = &devdata[i]._ipath_status;
+
+	/*
+	 * init these early, in case we take an interrupt as soon as the
+	 * irq is set up.  Saw a spinlock panic once that appeared to be
+	 * due to that problem, when they were initted later on.
+	 */
+	spin_lock_init(&ipath_pioavail_lock);
+	spin_lock_init(&ipath_sma_lock);
+
+	pci_register_driver(&infinipath_driver);
+
+	driver_create_file(&(infinipath_driver.driver), &driver_attr_version);
+
+	if ((r = register_chrdev(ipath_major, MODNAME, &ipath_fops)))
+		_IPATH_ERROR("Unable to register %s device\n", MODNAME);
+
+	/*
+	 * never return an error, since we could have stuff registered,
+	 * resources used, etc., even if no hardware found.  This way we
+	 * can clean up through unload.
+	 */
+	return 0;
+}
+
+/*
+ * note: if for some reason the unload fails after this routine, and leaves
+ * the driver enterable by user code, we'll almost certainly crash and burn...
+ */
+static void __exit infinipath_cleanup(void)
+{
+	int r, m, port;
+
+	driver_remove_file(&(infinipath_driver.driver), &driver_attr_version);
+	if (ipath_ctl_header) {
+		unregister_sysctl_table(ipath_ctl_header);
+		ipath_ctl_header = NULL;
+	} else
+		_IPATH_DBG("No sysctl unregister, not registered OK\n");
+	if ((r = unregister_chrdev(ipath_major, MODNAME)))
+		_IPATH_DBG("unregister of device failed: %d\n", r);
+
+	/*
+	 * turn off rcv, send, and interrupts for all ports, all drivers
+	 * (should we also hard reset the chip here?);
+	 * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
+	 * for all versions of the driver, if they were allocated
+	 */
+	for (m = 0; m < infinipath_max; m++) {
+		uint64_t val;
+		ipath_devdata *dd = &devdata[m];
+		if (dd->ipath_kregbase) {
+			/* in case unload fails, be consistent */
+			dd->ipath_rcvctrl = 0U;
+			ipath_kput_kreg(m, kr_rcvctrl, dd->ipath_rcvctrl);
+
+			/*
+			 * gracefully stop all sends, allowing any in
+			 * progress to trickle out first.
+			 */
+			ipath_kput_kreg(m, kr_sendctrl, 0ULL);
+			val = ipath_kget_kreg64(m, kr_scratch);	/* flush it */
+			/*
+			 * enough for anything that's going to trickle
+			 * out to have actually done so.
+			 */
+			udelay(5);
+
+			/*
+			 * abort any armed or launched PIO buffers that
+			 * didn't go (self-clearing).  Will cause any
+			 * packet currently being transmitted to go out
+			 * with an EBP, and may also cause a short packet
+			 * error on the receiver.
+			 */
+			ipath_kput_kreg(m, kr_sendctrl, INFINIPATH_S_ABORT);
+
+			/* mask interrupts, but not errors */
+			ipath_kput_kreg(m, kr_intmask, 0ULL);
+			ipath_shutdown_link(m);
+
+			/*
+			 * clear all interrupts and errors.  Next time the
+			 * driver is loaded, we know that whatever is set
+			 * happened while we were unloaded
+			 */
+			ipath_kput_kreg(m, kr_hwerrclear, ~0ULL);
+			ipath_kput_kreg(m, kr_errorclear, ~0ULL);
+			ipath_kput_kreg(m, kr_intclear, ~0ULL);
+			if (dd->__ipath_pioavailregs_base) {
+				kfree((void *)dd->__ipath_pioavailregs_base);
+				dd->__ipath_pioavailregs_base =
+					dd->ipath_pioavailregs_dma = 0;
+			}
+
+			if (dd->ipath_pageshadow) {
+				struct page **tmpp = dd->ipath_pageshadow;
+				int i, cnt = 0;
+
+				_IPATH_VDBG("Unlocking any expTID pages still locked\n");
+				for (port = 0; port < dd->ipath_cfgports;
+				     port++) {
+					int port_tidbase =
+						port * dd->ipath_rcvtidcnt;
+					int maxtid =
+						port_tidbase + dd->ipath_rcvtidcnt;
+					for (i = port_tidbase; i < maxtid; i++) {
+						if (tmpp[i]) {
+							ipath_munlock(1, &tmpp[i]);
+							tmpp[i] = 0;
+							cnt++;
+						}
+					}
+				}
+				if (cnt) {
+					ipath_stats.sps_pageunlocks += cnt;
+					_IPATH_VDBG("There were still %u expTID entries locked\n",
+						    cnt);
+				}
+				if (ipath_stats.sps_pagelocks ||
+				    ipath_stats.sps_pageunlocks)
+					_IPATH_VDBG("%llu pages locked, %llu unlocked via ipath_m{un}lock\n",
+						    ipath_stats.sps_pagelocks,
+						    ipath_stats.sps_pageunlocks);
+
+				_IPATH_VDBG("Free shadow page tid array at %p\n",
+					    dd->ipath_pageshadow);
+				vfree(dd->ipath_pageshadow);
+				dd->ipath_pageshadow = NULL;
+			}
+
+			/*
+			 * free any resources still in use (usually just
+			 * kernel ports) at unload
+			 */
+			for (port = 0; port < dd->ipath_cfgports; port++)
+				ipath_free_pddata(dd, port, 1);
+			kfree(dd->ipath_pd);
+			/*
+			 * debuggability, in case some cleanup path
+			 * tries to use it after this
+			 */
+			dd->ipath_pd = NULL;
+		}
+
+		if (dd->pcidev) {
+			if (dd->pcidev->irq) {
+				_IPATH_VDBG("unit %u free_irq of irq %x\n", m,
+					    dd->pcidev->irq);
+				free_irq(dd->pcidev->irq, dd);
+			} else
+				_IPATH_DBG("irq is 0, not doing free_irq for unit %u\n",
+					   m);
+			dd->pcidev = NULL;
+		}
+		if (dd->pci_registered) {
+			_IPATH_VDBG("Unregistering pci infrastructure unit %u\n",
+				    m);
+			pci_unregister_driver(&infinipath_driver);
+			dd->pci_registered = 0;
+		} else
+			_IPATH_VDBG("unit %u: no pci unreg, wasn't registered\n",
+				    m);
+		/* clean up any per-chip chip-specific stuff */
+		ipath_chip_cleanup(dd);
+	}
+	/*
+	 * clean up any chip-specific stuff; for now, only one type of chip
+	 * for any given driver
+	 */
+	ipath_chip_done();
+
+	/* cleanup all our locked pages private data structures */
+	ipath_mlock_cleanup(NULL);
+}
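/*
 * The expTID pass in infinipath_cleanup() scans the per-port shadow
 * array for pages a user never freed and releases them, counting what it
 * reclaims.  A standalone sketch of that sweep; release_page() and the
 * array geometry here are hypothetical stand-ins for ipath_munlock() and
 * the driver's per-port TID layout.
 */
#include <stddef.h>

extern void release_page(void *page);	/* stand-in for ipath_munlock() */

/* returns the number of leaked entries reclaimed */
unsigned int sweep_shadow(void **shadow, unsigned int nports,
			  unsigned int tids_per_port)
{
	unsigned int port, i, cnt = 0;

	for (port = 0; port < nports; port++) {
		unsigned int base = port * tids_per_port;

		for (i = base; i < base + tids_per_port; i++) {
			if (shadow[i]) {
				release_page(shadow[i]);
				shadow[i] = NULL;
				cnt++;
			}
		}
	}
	return cnt;
}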
+
+/*
+ * This is a generic function, so it can return device-specific info;
+ * it keeps us in sync with the version that supports multiple chip
+ * types.
+ */
+void ipath_get_boardname(const ipath_type t, char *name, size_t namelen)
+{
+	ipath_ht_get_boardname(t, name, namelen);
+}
+
+module_init(infinipath_init);
+module_exit(infinipath_cleanup);
+
+EXPORT_SYMBOL(infinipath_debug);
+EXPORT_SYMBOL(ipath_get_boardname);
-- 
0.99.9n