2011-05-09 20:18:05

by Cliff Wickman

Subject: [PATCH] x86: UV BAU fix for non-consecutive nasids



From: Cliff Wickman <[email protected]>

This is a fix for the SGI Altix-UV Broadcast Assist Unit code, which is
used for TLB flushing.

This patch allows nasids (NUMA address space IDs) to be non-consecutive, as is
the case when using nasid striding or in certain hardware configurations.
It also provides a table in local memory for each cpu to translate target cpu
number to target pnode and nasid.
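
In outline (a paraphrase of the hunks below, not a complete extract), the
per-socket table and the lookup done at flush time are roughly:

	struct hub_and_pnode {
		short uvhub;
		short pnode;
	};

	/* at flush time, for each target cpu in the flush mask */
	hpp = &bcp->socket_master->target_hub_and_pnode[tcpu];
	tpnode = hpp->pnode - bcp->partition_base_pnode;
	bau_uvhub_set(tpnode, &bau_desc->distribution);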

Diffed against 2.6.39-rc6

Signed-off-by: Cliff Wickman <[email protected]>
---
arch/x86/include/asm/uv/uv_bau.h | 17 +++++--
arch/x86/platform/uv/tlb_uv.c | 93 ++++++++++++++++++++++++++-------------
2 files changed, 77 insertions(+), 33 deletions(-)

Index: linux/arch/x86/include/asm/uv/uv_bau.h
===================================================================
--- linux.orig/arch/x86/include/asm/uv/uv_bau.h
+++ linux/arch/x86/include/asm/uv/uv_bau.h
@@ -94,6 +94,8 @@
/* after this # consecutive successes, bump up the throttle if it was lowered */
#define COMPLETE_THRESHOLD 5

+#define UV_LB_SUBNODEID 0x10
+
/*
* number of entries in the destination side payload queue
*/
@@ -124,7 +126,7 @@
* The distribution specification (32 bytes) is interpreted as a 256-bit
* distribution vector. Adjacent bits correspond to consecutive even numbered
* nodeIDs. The result of adding the index of a given bit to the 15-bit
- * 'base_dest_nodeid' field of the header corresponds to the
+ * 'base_dest_nasid' field of the header corresponds to the
* destination nodeID associated with that specified bit.
*/
struct bau_target_uvhubmask {
@@ -176,7 +178,7 @@ struct bau_msg_payload {
struct bau_msg_header {
unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
/* bits 5:0 */
- unsigned int base_dest_nodeid:15; /* nasid of the */
+ unsigned int base_dest_nasid:15; /* nasid of the */
/* bits 20:6 */ /* first bit in uvhub map */
unsigned int command:8; /* message type */
/* bits 28:21 */
@@ -378,6 +380,10 @@ struct ptc_stats {
unsigned long d_rcanceled; /* number of messages canceled by resets */
};

+struct hub_and_pnode {
+ short uvhub;
+ short pnode;
+};
/*
* one per-cpu; to locate the software tables
*/
@@ -399,10 +405,12 @@ struct bau_control {
int baudisabled;
int set_bau_off;
short cpu;
+ short osnode;
short uvhub_cpu;
short uvhub;
short cpus_in_socket;
short cpus_in_uvhub;
+ short partition_base_pnode;
unsigned short message_number;
unsigned short uvhub_quiesce;
short socket_acknowledge_count[DEST_Q_SIZE];
@@ -422,15 +430,16 @@ struct bau_control {
int congested_period;
cycles_t period_time;
long period_requests;
+ struct hub_and_pnode *target_hub_and_pnode;
};

static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
{
return constant_test_bit(uvhub, &dstp->bits[0]);
}
-static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp)
+static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp)
{
- __set_bit(uvhub, &dstp->bits[0]);
+ __set_bit(pnode, &dstp->bits[0]);
}
static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
int nbits)
Index: linux/arch/x86/platform/uv/tlb_uv.c
===================================================================
--- linux.orig/arch/x86/platform/uv/tlb_uv.c
+++ linux/arch/x86/platform/uv/tlb_uv.c
@@ -699,16 +699,17 @@ const struct cpumask *uv_flush_tlb_other
struct mm_struct *mm,
unsigned long va, unsigned int cpu)
{
- int tcpu;
- int uvhub;
int locals = 0;
int remotes = 0;
int hubs = 0;
+ int tcpu;
+ int tpnode;
struct bau_desc *bau_desc;
struct cpumask *flush_mask;
struct ptc_stats *stat;
struct bau_control *bcp;
struct bau_control *tbcp;
+ struct hub_and_pnode *hpp;

/* kernel was booted 'nobau' */
if (nobau)
@@ -750,11 +751,18 @@ const struct cpumask *uv_flush_tlb_other
bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);

- /* cpu statistics */
for_each_cpu(tcpu, flush_mask) {
- uvhub = uv_cpu_to_blade_id(tcpu);
- bau_uvhub_set(uvhub, &bau_desc->distribution);
- if (uvhub == bcp->uvhub)
+ /*
+ * The distribution vector is a bit map of pnodes, relative
+ * to the partition base pnode (and the partition base nasid
+ * in the header).
+ * Translate cpu to pnode and hub using an array stored
+ * in local memory.
+ */
+ hpp = &bcp->socket_master->target_hub_and_pnode[tcpu];
+ tpnode = hpp->pnode - bcp->partition_base_pnode;
+ bau_uvhub_set(tpnode, &bau_desc->distribution);
+ if (hpp->uvhub == bcp->uvhub)
locals++;
else
remotes++;
@@ -855,7 +863,7 @@ void uv_bau_message_interrupt(struct pt_
* an interrupt, but causes an error message to be returned to
* the sender.
*/
-static void uv_enable_timeouts(void)
+static void __init uv_enable_timeouts(void)
{
int uvhub;
int nuvhubs;
@@ -1326,10 +1334,10 @@ static int __init uv_ptc_init(void)
}

/*
- * initialize the sending side's sending buffers
+ * Initialize the sending side's sending buffers.
*/
static void
-uv_activation_descriptor_init(int node, int pnode)
+uv_activation_descriptor_init(int node, int pnode, int part_base_pnode)
{
int i;
int cpu;
@@ -1352,11 +1360,11 @@ uv_activation_descriptor_init(int node,
n = pa >> uv_nshift;
m = pa & uv_mmask;

+ /* the 14-bit pnode */
uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
(n << UV_DESC_BASE_PNODE_SHIFT | m));
-
/*
- * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+ * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
* cpu even though we only use the first one; one descriptor can
* describe a broadcast to 256 uv hubs.
*/
@@ -1365,12 +1373,14 @@ uv_activation_descriptor_init(int node,
memset(bd2, 0, sizeof(struct bau_desc));
bd2->header.sw_ack_flag = 1;
/*
- * base_dest_nodeid is the nasid of the first uvhub
- * in the partition. The bit map will indicate uvhub numbers,
- * which are 0-N in a partition. Pnodes are unique system-wide.
+ * The base_dest_nasid set in the message header is the nasid
+ * of the first uvhub in the partition. The bit map will
+ * indicate destination pnode numbers relative to that base.
+ * They may not be consecutive if nasid striding is being used.
*/
- bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode);
- bd2->header.dest_subnodeid = 0x10; /* the LB */
+ bd2->header.base_dest_nasid =
+ UV_PNODE_TO_NASID(part_base_pnode);
+ bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
bd2->header.command = UV_NET_ENDPOINT_INTD;
bd2->header.int_both = 1;
/*
@@ -1442,7 +1452,7 @@ uv_payload_queue_init(int node, int pnod
/*
* Initialization of each UV hub's structures
*/
-static void __init uv_init_uvhub(int uvhub, int vector)
+static void __init uv_init_uvhub(int uvhub, int vector, int part_base_pnode)
{
int node;
int pnode;
@@ -1450,11 +1460,11 @@ static void __init uv_init_uvhub(int uvh

node = uvhub_to_first_node(uvhub);
pnode = uv_blade_to_pnode(uvhub);
- uv_activation_descriptor_init(node, pnode);
+ uv_activation_descriptor_init(node, pnode, part_base_pnode);
uv_payload_queue_init(node, pnode);
/*
- * the below initialization can't be in firmware because the
- * messaging IRQ will be determined by the OS
+ * The below initialization can't be in firmware because the
+ * messaging IRQ will be determined by the OS.
*/
apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
@@ -1491,10 +1501,11 @@ calculate_destination_timeout(void)
/*
* initialize the bau_control structure for each cpu
*/
-static int __init uv_init_per_cpu(int nuvhubs)
+static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode)
{
int i;
int cpu;
+ int tcpu;
int pnode;
int uvhub;
int have_hmaster;
@@ -1528,6 +1539,15 @@ static int __init uv_init_per_cpu(int nu
bcp = &per_cpu(bau_control, cpu);
memset(bcp, 0, sizeof(struct bau_control));
pnode = uv_cpu_hub_info(cpu)->pnode;
+ if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) {
+ printk(KERN_EMERG
+ "cpu %d pnode %d-%d beyond %d; BAU disabled\n",
+ cpu, pnode, base_part_pnode,
+ UV_DISTRIBUTION_SIZE);
+ return 1;
+ }
+ bcp->osnode = cpu_to_node(cpu);
+ bcp->partition_base_pnode = uv_partition_base_pnode;
uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
bdp = &uvhub_descs[uvhub];
@@ -1536,7 +1556,7 @@ static int __init uv_init_per_cpu(int nu
bdp->pnode = pnode;
/* kludge: 'assuming' one node per socket, and assuming that
disabling a socket just leaves a gap in node numbers */
- socket = (cpu_to_node(cpu) & 1);
+ socket = bcp->osnode & 1;
bdp->socket_mask |= (1 << socket);
sdp = &bdp->socket[socket];
sdp->cpu_number[sdp->num_cpus] = cpu;
@@ -1585,6 +1605,20 @@ static int __init uv_init_per_cpu(int nu
nextsocket:
socket++;
socket_mask = (socket_mask >> 1);
+ /* each socket gets a local array of pnodes/hubs */
+ bcp = smaster;
+ bcp->target_hub_and_pnode = kmalloc_node(
+ sizeof(struct hub_and_pnode) *
+ num_possible_cpus(), GFP_KERNEL, bcp->osnode);
+ memset(bcp->target_hub_and_pnode, 0,
+ sizeof(struct hub_and_pnode) *
+ num_possible_cpus());
+ for_each_present_cpu(tcpu) {
+ bcp->target_hub_and_pnode[tcpu].pnode =
+ uv_cpu_hub_info(tcpu)->pnode;
+ bcp->target_hub_and_pnode[tcpu].uvhub =
+ uv_cpu_hub_info(tcpu)->numa_blade_id;
+ }
}
}
kfree(uvhub_descs);
@@ -1637,21 +1671,22 @@ static int __init uv_bau_init(void)
spin_lock_init(&disable_lock);
congested_cycles = microsec_2_cycles(congested_response_us);

- if (uv_init_per_cpu(nuvhubs)) {
- nobau = 1;
- return 0;
- }
-
uv_partition_base_pnode = 0x7fffffff;
- for (uvhub = 0; uvhub < nuvhubs; uvhub++)
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
if (uv_blade_nr_possible_cpus(uvhub) &&
(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
+ }
+
+ if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) {
+ nobau = 1;
+ return 0;
+ }

vector = UV_BAU_MESSAGE;
for_each_possible_blade(uvhub)
if (uv_blade_nr_possible_cpus(uvhub))
- uv_init_uvhub(uvhub, vector);
+ uv_init_uvhub(uvhub, vector, uv_partition_base_pnode);

uv_enable_timeouts();
alloc_intr_gate(vector, uv_bau_message_intr1);


2011-05-10 06:27:01

by Ingo Molnar

Subject: Re: [PATCH] x86: UV BAU fix for non-consecutive nasids


* Cliff Wickman <[email protected]> wrote:

> From: Cliff Wickman <[email protected]>
>
> This is a fix for the SGI Altix-UV Broadcast Assist Unit code, which is
> used for TLB flushing.
>
> This patch allows nasids (NUMA address space IDs) to be non-consecutive, as is
> the case when using nasid striding or in certain hardware configurations.
> It also provides a table in local memory for each cpu to translate target cpu
> number to target pnode and nasid.

This changelog is missing one or more of the following essential pieces
of information:

- what user-visible problem is the patch solving
- how was the problem triggered, how was the bug found
- what/how many systems are affected
- why should maintainers apply this patch right now

Thanks,

Ingo

2011-05-10 06:34:57

by Ingo Molnar

Subject: Re: [PATCH] x86: UV BAU fix for non-consecutive nasids


* Cliff Wickman <[email protected]> wrote:

> @@ -1365,12 +1373,14 @@ uv_activation_descriptor_init(int node,
> memset(bd2, 0, sizeof(struct bau_desc));
> bd2->header.sw_ack_flag = 1;
> /*
> - * base_dest_nodeid is the nasid of the first uvhub
> - * in the partition. The bit map will indicate uvhub numbers,
> - * which are 0-N in a partition. Pnodes are unique system-wide.
> + * The base_dest_nasid set in the message header is the nasid
> + * of the first uvhub in the partition. The bit map will
> + * indicate destination pnode numbers relative to that base.
> + * They may not be consecutive if nasid striding is being used.
> */
> - bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode);
> - bd2->header.dest_subnodeid = 0x10; /* the LB */
> + bd2->header.base_dest_nasid =
> + UV_PNODE_TO_NASID(part_base_pnode);
> + bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
> bd2->header.command = UV_NET_ENDPOINT_INTD;
> bd2->header.int_both = 1;

This code is sick. Illness: too long lines.

checkpatch warned you but you applied the wrong cure and
broke the lines in an ugly way.

Can you think of another cure? One of the many options:

- eliminate many repetitive strings
- reduce indentation by the introduction of helper inlines
- sensible shortening of variable/field/definition names

... applies to this particular case and will solve the problem.
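
For instance, a minimal sketch of the "helper inline" option (the helper
name below is invented, not part of the patch) would hoist the constant
header fields out of the descriptor loop:

	static inline void uv_init_bau_hdr(struct bau_msg_header *hdr,
					   int base_pnode)
	{
		hdr->sw_ack_flag     = 1;
		hdr->base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
		hdr->dest_subnodeid  = UV_LB_SUBNODEID;
		hdr->command         = UV_NET_ENDPOINT_INTD;
		hdr->int_both        = 1;
	}

The loop body then shrinks to one call per descriptor and none of the
assignments need to be wrapped.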

> @@ -1528,6 +1539,15 @@ static int __init uv_init_per_cpu(int nu
> bcp = &per_cpu(bau_control, cpu);
> memset(bcp, 0, sizeof(struct bau_control));
> pnode = uv_cpu_hub_info(cpu)->pnode;
> + if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) {
> + printk(KERN_EMERG
> + "cpu %d pnode %d-%d beyond %d; BAU disabled\n",
> + cpu, pnode, base_part_pnode,
> + UV_DISTRIBUTION_SIZE);

See my points above.
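
For instance (again only a sketch), pr_emerg() already carries the
KERN_EMERG level, which buys back enough columns here:

	if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) {
		pr_emerg("cpu %d pnode %d-%d beyond %d; BAU disabled\n",
			 cpu, pnode, base_part_pnode, UV_DISTRIBUTION_SIZE);
		return 1;
	}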

> @@ -1585,6 +1605,20 @@ static int __init uv_init_per_cpu(int nu
> nextsocket:
> socket++;
> socket_mask = (socket_mask >> 1);
> + /* each socket gets a local array of pnodes/hubs */
> + bcp = smaster;
> + bcp->target_hub_and_pnode = kmalloc_node(
> + sizeof(struct hub_and_pnode) *
> + num_possible_cpus(), GFP_KERNEL, bcp->osnode);
> + memset(bcp->target_hub_and_pnode, 0,
> + sizeof(struct hub_and_pnode) *
> + num_possible_cpus());
> + for_each_present_cpu(tcpu) {
> + bcp->target_hub_and_pnode[tcpu].pnode =
> + uv_cpu_hub_info(tcpu)->pnode;
> + bcp->target_hub_and_pnode[tcpu].uvhub =
> + uv_cpu_hub_info(tcpu)->numa_blade_id;
> + }

See my points above.
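
For instance, a sketch of the same hunk using kzalloc_node() and a short
local pointer (the name 'thp' is made up here):

	struct hub_and_pnode *thp;

	bcp = smaster;
	thp = kzalloc_node(sizeof(*thp) * num_possible_cpus(),
			   GFP_KERNEL, bcp->osnode);
	bcp->target_hub_and_pnode = thp;
	for_each_present_cpu(tcpu) {
		thp[tcpu].pnode = uv_cpu_hub_info(tcpu)->pnode;
		thp[tcpu].uvhub = uv_cpu_hub_info(tcpu)->numa_blade_id;
	}

The separate memset() goes away and the long field accesses fit without
continuation lines.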

Thanks,

Ingo