2009-03-13 18:34:55

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

Hi all.

This is only a draft of patch to consult. I'm aware that it should be divided
into multiple patches. I want to know opinion from you folks.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, bonding interface uses the same
mac address for all enslaved devices. Except for mode balance-alb. When you put
this kind of bond device into a bridge it will only add one of mac adresses into
a hash list of mac addresses, say X. This mac address is marked as local. But
this bonding interface also has mac address Y. Now then packet arrives with
destination address Y, this address is not marked as local and the packed looks
like it needs to be forwarded. This packet is then lost which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.
Therefore I introduce another function pointer in struct net_device_ops -
ndo_check_mac_address. This function when it's implemented should check passed
mac address against the one set in device. I'm using this in bonding driver when
the bond is in mode balance-alb to walk thru all slaves and checking if any of
them equals passed address.

Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
to recognize the destination mac address as local.

Please look at this and tell me what you think about it.

Thanks

Jirka


Signed-off-by: Jiri Pirko <[email protected]>

drivers/net/bonding/bond_alb.c | 17 +++++++++++++++++
drivers/net/bonding/bond_alb.h | 1 +
drivers/net/bonding/bond_main.c | 11 +++++++++++
include/linux/netdevice.h | 7 +++++++
net/bridge/br_input.c | 5 ++++-
5 files changed, 40 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..b7bcee0 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,23 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
return 0;
}

+int bond_alb_check_mac_address(struct net_device *bond_dev, void *addr)
+{
+ struct bonding *bond = netdev_priv(bond_dev);
+ struct slave *slave = NULL;
+ int ret = !0;
+ int i;
+
+ read_lock(&bond->lock);
+ bond_for_each_slave(bond, slave, i) {
+ ret = compare_ether_addr(slave->perm_hwaddr, addr);
+ if (!ret)
+ break;
+ }
+ read_unlock(&bond->lock);
+ return ret;
+}
+
void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
{
if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..5e39bda 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
void bond_alb_monitor(struct work_struct *);
int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+int bond_alb_check_mac_address(struct net_device *bond_dev, void *addr);
void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
#endif /* __BOND_ALB_H__ */

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e0578fe..fbff338 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4279,6 +4279,16 @@ unwind:
return res;
}

+static int bond_check_mac_address(struct net_device *bond_dev, void *addr)
+{
+ struct bonding *bond = netdev_priv(bond_dev);
+
+ if (bond->params.mode == BOND_MODE_ALB)
+ return bond_alb_check_mac_address(bond_dev, addr);
+
+ return compare_ether_addr(bond_dev->dev_addr, addr);
+}
+
static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
{
struct bonding *bond = netdev_priv(bond_dev);
@@ -4576,6 +4586,7 @@ static const struct net_device_ops bond_netdev_ops = {
.ndo_set_multicast_list = bond_set_multicast_list,
.ndo_change_mtu = bond_change_mtu,
.ndo_set_mac_address = bond_set_mac_address,
+ .ndo_check_mac_address = bond_check_mac_address,
.ndo_neigh_setup = bond_neigh_setup,
.ndo_vlan_rx_register = bond_vlan_rx_register,
.ndo_vlan_rx_add_vid = bond_vlan_rx_add_vid,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6593667..e75f691 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -491,6 +491,10 @@ struct netdev_queue {
* needs to be changed. If not this interface is not defined, the
* mac address can not be changed.
*
+ * int (*ndo_check_mac_address)(struct net_device *dev, void *addr);
+ * This function is called when the given Media Access Control address
+ * needs to compared to the one set to the device.
+ *
* int (*ndo_validate_addr)(struct net_device *dev);
* Test if Media Access Control address is valid for the device.
*
@@ -554,6 +558,9 @@ struct net_device_ops {
#define HAVE_SET_MAC_ADDR
int (*ndo_set_mac_address)(struct net_device *dev,
void *addr);
+#define HAVE_CHECK_MAC_ADDR
+ int (*ndo_check_mac_address)(struct net_device *dev,
+ void *addr);
#define HAVE_VALIDATE_ADDR
int (*ndo_validate_addr)(struct net_device *dev);
#define HAVE_PRIVATE_IOCTL
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 30b8877..b071169 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -39,6 +39,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
{
const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
+ struct net_device *dev = p->dev;
struct net_bridge *br;
struct net_bridge_fdb_entry *dst;
struct sk_buff *skb2;
@@ -64,7 +65,9 @@ int br_handle_frame_finish(struct sk_buff *skb)
if (is_multicast_ether_addr(dest)) {
br->dev->stats.multicast++;
skb2 = skb;
- } else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
+ } else if (((dst = __br_fdb_get(br, dest)) && dst->is_local) ||
+ (dev->netdev_ops->ndo_check_mac_address &&
+ !dev->netdev_ops->ndo_check_mac_address(dev, (unsigned char *) dest))) {
skb2 = skb;
/* Do not forward the packet since it's local. */
skb = NULL;


2009-03-14 05:41:35

by Stephen Hemminger

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

On Fri, 13 Mar 2009 19:33:04 +0100
Jiri Pirko <[email protected]> wrote:

> Hi all.
>
> This is only a draft of patch to consult. I'm aware that it should be divided
> into multiple patches. I want to know opinion from you folks.
>
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices. Except for mode balance-alb. When you put
> this kind of bond device into a bridge it will only add one of mac adresses into
> a hash list of mac addresses, say X. This mac address is marked as local. But
> this bonding interface also has mac address Y. Now then packet arrives with
> destination address Y, this address is not marked as local and the packed looks
> like it needs to be forwarded. This packet is then lost which is wrong.
>
> Notice that interfaces can be added and removed from bond while it is in bridge.
> Therefore I introduce another function pointer in struct net_device_ops -
> ndo_check_mac_address. This function when it's implemented should check passed
> mac address against the one set in device. I'm using this in bonding driver when
> the bond is in mode balance-alb to walk thru all slaves and checking if any of
> them equals passed address.
>
> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
> to recognize the destination mac address as local.
>
> Please look at this and tell me what you think about it.
>
> Thanks
>
> Jirka
>

A better and more general way to do this have the dev_set_mac_address
function check the return of the notifier and unwind. Then any protocol
can easily prevent address from changing.

2009-03-14 09:51:01

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

Sat, Mar 14, 2009 at 06:39:32AM CET, [email protected] wrote:
>On Fri, 13 Mar 2009 19:33:04 +0100
>Jiri Pirko <[email protected]> wrote:
>
>> Hi all.
>>
>> This is only a draft of patch to consult. I'm aware that it should be divided
>> into multiple patches. I want to know opinion from you folks.
>>
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>>
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices. Except for mode balance-alb. When you put
>> this kind of bond device into a bridge it will only add one of mac adresses into
>> a hash list of mac addresses, say X. This mac address is marked as local. But
>> this bonding interface also has mac address Y. Now then packet arrives with
>> destination address Y, this address is not marked as local and the packed looks
>> like it needs to be forwarded. This packet is then lost which is wrong.
>>
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>> Therefore I introduce another function pointer in struct net_device_ops -
>> ndo_check_mac_address. This function when it's implemented should check passed
>> mac address against the one set in device. I'm using this in bonding driver when
>> the bond is in mode balance-alb to walk thru all slaves and checking if any of
>> them equals passed address.
>>
>> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
>> to recognize the destination mac address as local.
>>
>> Please look at this and tell me what you think about it.
>>
>> Thanks
>>
>> Jirka
>>
>
>A better and more general way to do this have the dev_set_mac_address
>function check the return of the notifier and unwind. Then any protocol
>can easily prevent address from changing.

Can you please describe this thougth a bit more? I can't understand it now...

Thanks

Jirka

2009-03-15 23:13:50

by Stephen Hemminger

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

On Sat, 14 Mar 2009 10:49:11 +0100
Jiri Pirko <[email protected]> wrote:

> Sat, Mar 14, 2009 at 06:39:32AM CET, [email protected] wrote:
> >On Fri, 13 Mar 2009 19:33:04 +0100
> >Jiri Pirko <[email protected]> wrote:
> >
> >> Hi all.
> >>
> >> This is only a draft of patch to consult. I'm aware that it should be divided
> >> into multiple patches. I want to know opinion from you folks.
> >>
> >> The problem is described in following bugzilla:
> >> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> >>
> >> Basically here's what's going on. In every mode, bonding interface uses the same
> >> mac address for all enslaved devices. Except for mode balance-alb. When you put
> >> this kind of bond device into a bridge it will only add one of mac adresses into
> >> a hash list of mac addresses, say X. This mac address is marked as local. But
> >> this bonding interface also has mac address Y. Now then packet arrives with
> >> destination address Y, this address is not marked as local and the packed looks
> >> like it needs to be forwarded. This packet is then lost which is wrong.
> >>
> >> Notice that interfaces can be added and removed from bond while it is in bridge.
> >> Therefore I introduce another function pointer in struct net_device_ops -
> >> ndo_check_mac_address. This function when it's implemented should check passed
> >> mac address against the one set in device. I'm using this in bonding driver when
> >> the bond is in mode balance-alb to walk thru all slaves and checking if any of
> >> them equals passed address.
> >>
> >> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
> >> to recognize the destination mac address as local.
> >>
> >> Please look at this and tell me what you think about it.
> >>
> >> Thanks
> >>
> >> Jirka
> >>
> >
> >A better and more general way to do this have the dev_set_mac_address
> >function check the return of the notifier and unwind. Then any protocol
> >can easily prevent address from changing.
>
> Can you please describe this thougth a bit more? I can't understand it now...
>
> Thanks
>
> Jirka

Something like this:

--- a/net/core/dev.c 2009-03-15 15:55:02.098126056 -0700
+++ b/net/core/dev.c 2009-03-15 16:02:43.999251305 -0700
@@ -3830,6 +3830,7 @@ int dev_set_mac_address(struct net_devic
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
+ char save_addr[MAX_ADDR_LEN];

if (!ops->ndo_set_mac_address)
return -EOPNOTSUPP;
@@ -3837,9 +3838,17 @@ int dev_set_mac_address(struct net_devic
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
+
+ memcpy(save_addr, dev->dev_addr, dev->addr_len);
err = ops->ndo_set_mac_address(dev, sa);
- if (!err)
- call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ if (err)
+ return err;
+
+ err = call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ if (err) {
+ memcpy(sa->sa_data, save_addr, dev->addr_len);
+ ops->ndo_set_mac_address(dev, sa);
+ }
return err;
}


And something like this:

--- a/drivers/net/bonding/bond_main.c 2009-03-15 16:03:53.909000973 -0700
+++ b/drivers/net/bonding/bond_main.c 2009-03-15 16:11:43.227127031 -0700
@@ -3534,6 +3534,7 @@ static int bond_slave_netdev_event(unsig
{
struct net_device *bond_dev = slave_dev->master;
struct bonding *bond = netdev_priv(bond_dev);
+ int err;

switch (event) {
case NETDEV_UNREGISTER:
@@ -3570,6 +3571,15 @@ static int bond_slave_netdev_event(unsig
* servitude.
*/
break;
+ case NETDEV_CHANGEADDR:
+ if (bond->params.mode == BOND_MODE_ALB)
+ err = bond_alb_check_mac_address(bond);
+ else if (compare_ether_addr(bond_dev->dev_addr, addr) != 0)
+ err = -EINVAL;
+
+ if (err)
+ return notifier_from_errno(err);
+ break;
case NETDEV_CHANGENAME:
/*
* TODO: handle changing the primary's name


2009-03-16 11:13:20

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

Mon, Mar 16, 2009 at 12:12:17AM CET, [email protected] wrote:
>On Sat, 14 Mar 2009 10:49:11 +0100
>Jiri Pirko <[email protected]> wrote:
>
>> Sat, Mar 14, 2009 at 06:39:32AM CET, [email protected] wrote:
>> >On Fri, 13 Mar 2009 19:33:04 +0100
>> >Jiri Pirko <[email protected]> wrote:
>> >
>> >> Hi all.
>> >>
>> >> This is only a draft of patch to consult. I'm aware that it should be divided
>> >> into multiple patches. I want to know opinion from you folks.
>> >>
>> >> The problem is described in following bugzilla:
>> >> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> >>
>> >> Basically here's what's going on. In every mode, bonding interface uses the same
>> >> mac address for all enslaved devices. Except for mode balance-alb. When you put
>> >> this kind of bond device into a bridge it will only add one of mac adresses into
>> >> a hash list of mac addresses, say X. This mac address is marked as local. But
>> >> this bonding interface also has mac address Y. Now then packet arrives with
>> >> destination address Y, this address is not marked as local and the packed looks
>> >> like it needs to be forwarded. This packet is then lost which is wrong.
>> >>
>> >> Notice that interfaces can be added and removed from bond while it is in bridge.
>> >> Therefore I introduce another function pointer in struct net_device_ops -
>> >> ndo_check_mac_address. This function when it's implemented should check passed
>> >> mac address against the one set in device. I'm using this in bonding driver when
>> >> the bond is in mode balance-alb to walk thru all slaves and checking if any of
>> >> them equals passed address.
>> >>
>> >> Then in bridge function br_handle_frame_finish() I'm using ndo_check_mac_address
>> >> to recognize the destination mac address as local.
>> >>
>> >> Please look at this and tell me what you think about it.
>> >>
>> >> Thanks
>> >>
>> >> Jirka
>> >>
>> >
>> >A better and more general way to do this have the dev_set_mac_address
>> >function check the return of the notifier and unwind. Then any protocol
>> >can easily prevent address from changing.
>>
>> Can you please describe this thougth a bit more? I can't understand it now...
>>
>> Thanks
>>
>> Jirka
>
>Something like this:
>
>--- a/net/core/dev.c 2009-03-15 15:55:02.098126056 -0700
>+++ b/net/core/dev.c 2009-03-15 16:02:43.999251305 -0700
>@@ -3830,6 +3830,7 @@ int dev_set_mac_address(struct net_devic
> {
> const struct net_device_ops *ops = dev->netdev_ops;
> int err;
>+ char save_addr[MAX_ADDR_LEN];
>
> if (!ops->ndo_set_mac_address)
> return -EOPNOTSUPP;
>@@ -3837,9 +3838,17 @@ int dev_set_mac_address(struct net_devic
> return -EINVAL;
> if (!netif_device_present(dev))
> return -ENODEV;
>+
>+ memcpy(save_addr, dev->dev_addr, dev->addr_len);
> err = ops->ndo_set_mac_address(dev, sa);
>- if (!err)
>- call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+ if (err)
>+ return err;
>+
>+ err = call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+ if (err) {
>+ memcpy(sa->sa_data, save_addr, dev->addr_len);
>+ ops->ndo_set_mac_address(dev, sa);
>+ }
> return err;
> }
>
>
>And something like this:
>
>--- a/drivers/net/bonding/bond_main.c 2009-03-15 16:03:53.909000973 -0700
>+++ b/drivers/net/bonding/bond_main.c 2009-03-15 16:11:43.227127031 -0700
>@@ -3534,6 +3534,7 @@ static int bond_slave_netdev_event(unsig
> {
> struct net_device *bond_dev = slave_dev->master;
> struct bonding *bond = netdev_priv(bond_dev);
>+ int err;
>
> switch (event) {
> case NETDEV_UNREGISTER:
>@@ -3570,6 +3571,15 @@ static int bond_slave_netdev_event(unsig
> * servitude.
> */
> break;
>+ case NETDEV_CHANGEADDR:
>+ if (bond->params.mode == BOND_MODE_ALB)
>+ err = bond_alb_check_mac_address(bond);
>+ else if (compare_ether_addr(bond_dev->dev_addr, addr) != 0)
>+ err = -EINVAL;
>+
>+ if (err)
>+ return notifier_from_errno(err);
>+ break;
> case NETDEV_CHANGENAME:
> /*
> * TODO: handle changing the primary's name
>
Yes, I think the changing mac address of slaves should be also handled by
bonding driver. But my patch fixes a different issue. See, unlike in any other
bonding modes, in balance-alb mode incoming packets have multiple MAC adresses
(of any of enslaved devices). This causes problem because bridge only recognize
one of them (the mac of master which is the mac on one of the slaves) as local -
the other MAC's are not recognized as they are a part of port and therefore
handled as general MAC adresses. This is the problem.

I can see two solutions. Either like my patch or somehow allow bridge to know
more MAC addressses per port (maybe netdev can be changed to know more then
one MAC address).

Any thoughts?

Thanks

Jirka
>
>

2009-03-19 06:20:30

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

From: Jiri Pirko <[email protected]>
Date: Mon, 16 Mar 2009 12:11:28 +0100

> I can see two solutions. Either like my patch or somehow allow bridge to know
> more MAC addressses per port (maybe netdev can be changed to know more then
> one MAC address).
>
> Any thoughts?

The netdev struct already supports having a list of multiple unicast
MAC addresses, it can probably be used and inspected for this.

I'll hold off on your patch until we make some more progress on
this discussion.

2009-03-19 08:47:28

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

Thu, Mar 19, 2009 at 07:20:03AM CET, [email protected] wrote:
>From: Jiri Pirko <[email protected]>
>Date: Mon, 16 Mar 2009 12:11:28 +0100
>
>> I can see two solutions. Either like my patch or somehow allow bridge to know
>> more MAC addressses per port (maybe netdev can be changed to know more then
>> one MAC address).
>>
>> Any thoughts?
>
>The netdev struct already supports having a list of multiple unicast
>MAC addresses, it can probably be used and inspected for this.
Yes I was looking at this thing yesterday (uc_list). But this list serves
to different purpose. Do you think that it will be correct to use it for this? I
would maybe like to make a new list similar to this for our purpose
(say addr_list). I think it would be more correct.

Eventually in the furute we would use this list as a primary place to store
device address instead of dev_addr value and make it more general (as device
generally may have more adresses). Just a thought...

>
>I'll hold off on your patch until we make some more progress on
>this discussion.

2009-03-19 08:50:29

by Patrick McHardy

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

David Miller wrote:
> From: Jiri Pirko <[email protected]>
> Date: Mon, 16 Mar 2009 12:11:28 +0100
>
>> I can see two solutions. Either like my patch or somehow allow bridge to know
>> more MAC addressses per port (maybe netdev can be changed to know more then
>> one MAC address).
>>
>> Any thoughts?
>
> The netdev struct already supports having a list of multiple unicast
> MAC addresses, it can probably be used and inspected for this.
>
> I'll hold off on your patch until we make some more progress on
> this discussion.

From reading the balance-alb description, I get the impression that this
mode is simply not meant to be used with bridging:

Adaptive load balancing: includes balance-tlb plus
receive load balancing (rlb) for IPV4 traffic, and
does not require any special switch support. The
receive load balancing is achieved by ARP negotiation.
The bonding driver intercepts the ARP Replies sent by
the local system on their way out and overwrites the
source hardware address with the unique hardware
address of one of the slaves in the bond such that
different peers use different hardware addresses for
the server.

In any case I'd tend to say that if bond-alb mode mangles outgoing MAC
addresses, it should restore the original one for received packets
and keep the hacks local to bonding.

2009-03-19 10:22:17

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

From: Jiri Pirko <[email protected]>
Date: Thu, 19 Mar 2009 09:44:45 +0100

> Yes I was looking at this thing yesterday (uc_list). But this list serves
> to different purpose. Do you think that it will be correct to use it for this? I
> would maybe like to make a new list similar to this for our purpose
> (say addr_list). I think it would be more correct.

Whatever you do with that list privately inside of the bonding
driver should be fine.

It might upset something in the generic code if you don't clean
it up before deregistration of the bonding device, so just be
tidy.

2009-03-19 11:21:20

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

Thu, Mar 19, 2009 at 11:21:43AM CET, [email protected] wrote:
>From: Jiri Pirko <[email protected]>
>Date: Thu, 19 Mar 2009 09:44:45 +0100
>
>> Yes I was looking at this thing yesterday (uc_list). But this list serves
>> to different purpose. Do you think that it will be correct to use it for this? I
>> would maybe like to make a new list similar to this for our purpose
>> (say addr_list). I think it would be more correct.
>
>Whatever you do with that list privately inside of the bonding
>driver should be fine.
Well I do not need it only inside the bonding driver. I want bridge to use this
list when adding a device in it and get mac addresses from there into its
hashlist (to recognize these addresses as local).
>
>It might upset something in the generic code if you don't clean
>it up before deregistration of the bonding device, so just be
>tidy.

2009-03-19 16:35:25

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

Thu, Mar 19, 2009 at 09:50:03AM CET, [email protected] wrote:
> David Miller wrote:
>> From: Jiri Pirko <[email protected]>
>> Date: Mon, 16 Mar 2009 12:11:28 +0100
>>
>>> I can see two solutions. Either like my patch or somehow allow bridge to know
>>> more MAC addressses per port (maybe netdev can be changed to know more then
>>> one MAC address).
>>>
>>> Any thoughts?
>>
>> The netdev struct already supports having a list of multiple unicast
>> MAC addresses, it can probably be used and inspected for this.
>>
>> I'll hold off on your patch until we make some more progress on
>> this discussion.
>
> From reading the balance-alb description, I get the impression that this
> mode is simply not meant to be used with bridging:
>
> Adaptive load balancing: includes balance-tlb plus
> receive load balancing (rlb) for IPV4 traffic, and
> does not require any special switch support. The
> receive load balancing is achieved by ARP negotiation.
> The bonding driver intercepts the ARP Replies sent by
> the local system on their way out and overwrites the
> source hardware address with the unique hardware
> address of one of the slaves in the bond such that
> different peers use different hardware addresses for
> the server.
>
> In any case I'd tend to say that if bond-alb mode mangles outgoing MAC
> addresses, it should restore the original one for received packets
> and keep the hacks local to bonding.

To let bonding driver to resolve this I think there will be needed some kind of
hook in netif_receive_skb() as for example bridge has. I would rather do this
more general and transparent.

2009-03-24 09:55:14

by Stanichenko Marat

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge

> Thu, Mar 19, 2009 at 11:21:43AM CET, [email protected] wrote:
>>From: Jiri Pirko <[email protected]>
>>Date: Thu, 19 Mar 2009 09:44:45 +0100
>>
>>> Yes I was looking at this thing yesterday (uc_list). But this list serves
>>> to different purpose. Do you think that it will be correct to use it for this? I
>>> would maybe like to make a new list similar to this for our purpose
>>> (say addr_list). I think it would be more correct.
>>
>>Whatever you do with that list privately inside of the bonding
>>driver should be fine.
> Well I do not need it only inside the bonding driver. I want bridge to use this
> list when adding a device in it and get mac addresses from there into its
> hashlist (to recognize these addresses as local).
Please correct me if I understand you improperly. You're going to mark all mac
addresses that belong to slaves as "local" when adding a bond device to the
bridge, aren't you? The only thing I'd like to notice (this might be an obvious
one): a packet that is pushed out from one slave might reach the host through
another slave. Considering all slaves as "local" in bridge code might lead to
numerous messages "received packet with own address as source address".

Please CC me personally when answering this message.

Thanks,
Marat.

2009-03-25 13:07:32

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try2

(resend)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, bonding interface uses the same
mac address for all enslaved devices. Except for mode balance-alb. When you put
this kind of bond device into a bridge it will only add one of mac adresses into
a hash list of mac addresses, say X. This mac address is marked as local. But
this bonding interface also has mac address Y. Now then packet arrives with
destination address Y, this address is not marked as local and the packed looks
like it needs to be forwarded. This packet is then lost which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patch solves the situation in the bonding without touching bridge code,
as Patrick suggested. For every incoming frame to bonding it searches the
destination address in slaves list and if any of slave addresses matches, it
rewrites the address in frame by the adress of bonding master. This ensures that
all frames comming thru the bonding in alb mode have the same address.

Jirka


Signed-off-by: Jiri Pirko <[email protected]>

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..2838be0 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
return 0;
}

+void bond_alb_change_dest(struct sk_buff *skb)
+{
+ struct net_device *bond_dev = skb->dev;
+ struct bonding *bond = netdev_priv(bond_dev);
+ unsigned char *dest = eth_hdr(skb)->h_dest;
+ struct slave *slave;
+ int i;
+
+ if (!memcmp(dest, bond_dev->dev_addr, ETH_ALEN))
+ return;
+ read_lock(&bond->lock);
+ bond_for_each_slave(bond, slave, i) {
+ if (!memcmp(slave->dev->dev_addr, dest, ETH_ALEN)) {
+ memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+ break;
+ }
+ }
+ read_unlock(&bond->lock);
+}
+
void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
{
if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..77f36fb 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
void bond_alb_monitor(struct work_struct *);
int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+void bond_alb_change_dest(struct sk_buff *skb);
void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
#endif /* __BOND_ALB_H__ */

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3d76686..b62fdc4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4294,6 +4294,19 @@ unwind:
return res;
}

+/*
+ * Called via bond_change_dest_hook.
+ * note: already called with rcu_read_lock (preempt_disabled)
+ */
+void bond_change_dest(struct sk_buff *skb)
+{
+ struct net_device *bond_dev = skb->dev;
+ struct bonding *bond = netdev_priv(bond_dev);
+
+ if (bond->params.mode == BOND_MODE_ALB)
+ bond_alb_change_dest(skb);
+}
+
static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
{
struct bonding *bond = netdev_priv(bond_dev);
@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
register_inetaddr_notifier(&bond_inetaddr_notifier);
bond_register_ipv6_notifier();

+ bond_change_dest_hook = bond_change_dest;
+
goto out;
err:
list_for_each_entry(bond, &bond_dev_list, bond_list) {
@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
unregister_inetaddr_notifier(&bond_inetaddr_notifier);
bond_unregister_ipv6_notifier();

+ bond_change_dest_hook = NULL;
+
bond_destroy_sysfs();

rtnl_lock();
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index ca849d2..df92b70 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
}
#endif

+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
+
#endif /* _LINUX_BONDING_H */

diff --git a/net/core/dev.c b/net/core/dev.c
index e3fe5c7..abe68d9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
+EXPORT_SYMBOL(bond_change_dest_hook);
+#else
+#define bond_change_dest_hook(skb) do {} while (0)
+#endif
+
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;
@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
null_or_orig = NULL;
orig_dev = skb->dev;
if (orig_dev->master) {
- if (skb_bond_should_drop(skb))
+ if (skb_bond_should_drop(skb)) {
null_or_orig = orig_dev; /* deliver only exact match */
- else
+ } else {
skb->dev = orig_dev->master;
+ bond_change_dest_hook(skb);
+ }
}

__get_cpu_var(netdev_rx_stat).total++;

2009-03-25 13:44:27

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try2

Jiri Pirko a ?crit :
> (resend)
>
> Hi all.
>
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>
> Basically here's what's going on. In every mode, bonding interface uses the same
> mac address for all enslaved devices. Except for mode balance-alb. When you put
> this kind of bond device into a bridge it will only add one of mac adresses into
> a hash list of mac addresses, say X. This mac address is marked as local. But
> this bonding interface also has mac address Y. Now then packet arrives with
> destination address Y, this address is not marked as local and the packed looks
> like it needs to be forwarded. This packet is then lost which is wrong.
>
> Notice that interfaces can be added and removed from bond while it is in bridge.
>
> This patch solves the situation in the bonding without touching bridge code,
> as Patrick suggested. For every incoming frame to bonding it searches the
> destination address in slaves list and if any of slave addresses matches, it
> rewrites the address in frame by the adress of bonding master. This ensures that
> all frames comming thru the bonding in alb mode have the same address.
>
> Jirka
>
>
> Signed-off-by: Jiri Pirko <[email protected]>
>
> diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
> index 27fb7f5..2838be0 100644
> --- a/drivers/net/bonding/bond_alb.c
> +++ b/drivers/net/bonding/bond_alb.c
> @@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
> return 0;
> }
>
> +void bond_alb_change_dest(struct sk_buff *skb)
> +{
> + struct net_device *bond_dev = skb->dev;
> + struct bonding *bond = netdev_priv(bond_dev);
> + unsigned char *dest = eth_hdr(skb)->h_dest;
> + struct slave *slave;
> + int i;
> +
> + if (!memcmp(dest, bond_dev->dev_addr, ETH_ALEN))
> + return;
> + read_lock(&bond->lock);


Its a pity bonding doesnt use RCU and needs this read_lock(&bond->lock)


> + bond_for_each_slave(bond, slave, i) {
> + if (!memcmp(slave->dev->dev_addr, dest, ETH_ALEN)) {

compare_ether_addr() (or even better compare_ether_addr_64bits()) instead of memcmp() ?

> + memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
> + break;
> + }
> + }
> + read_unlock(&bond->lock);
> +}
> +

2009-03-25 14:42:05

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try2

Wed, Mar 25, 2009 at 02:40:43PM CET, [email protected] wrote:
>Jiri Pirko a ?crit :
>> (resend)
>>
>> Hi all.
>>
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>>
>> Basically here's what's going on. In every mode, bonding interface uses the same
>> mac address for all enslaved devices. Except for mode balance-alb. When you put
>> this kind of bond device into a bridge it will only add one of mac adresses into
>> a hash list of mac addresses, say X. This mac address is marked as local. But
>> this bonding interface also has mac address Y. Now then packet arrives with
>> destination address Y, this address is not marked as local and the packed looks
>> like it needs to be forwarded. This packet is then lost which is wrong.
>>
>> Notice that interfaces can be added and removed from bond while it is in bridge.
>>
>> This patch solves the situation in the bonding without touching bridge code,
>> as Patrick suggested. For every incoming frame to bonding it searches the
>> destination address in slaves list and if any of slave addresses matches, it
>> rewrites the address in frame by the adress of bonding master. This ensures that
>> all frames comming thru the bonding in alb mode have the same address.
>>
>> Jirka
>>
>>
>> Signed-off-by: Jiri Pirko <[email protected]>
>>
>> diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>> index 27fb7f5..2838be0 100644
>> --- a/drivers/net/bonding/bond_alb.c
>> +++ b/drivers/net/bonding/bond_alb.c
>> @@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
>> return 0;
>> }
>>
>> +void bond_alb_change_dest(struct sk_buff *skb)
>> +{
>> + struct net_device *bond_dev = skb->dev;
>> + struct bonding *bond = netdev_priv(bond_dev);
>> + unsigned char *dest = eth_hdr(skb)->h_dest;
>> + struct slave *slave;
>> + int i;
>> +
>> + if (!memcmp(dest, bond_dev->dev_addr, ETH_ALEN))
>> + return;
>> + read_lock(&bond->lock);
>
>
>Its a pity bonding doesnt use RCU and needs this read_lock(&bond->lock)

Sure it is...
>
>
>> + bond_for_each_slave(bond, slave, i) {
>> + if (!memcmp(slave->dev->dev_addr, dest, ETH_ALEN)) {
>
>compare_ether_addr() (or even better compare_ether_addr_64bits()) instead of memcmp() ?

Okay, I'll use compare_ether_addr_64bits and do the repost later on...
>
>> + memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>> + break;
>> + }
>> + }
>> + read_unlock(&bond->lock);
>> +}
>> +
>

2009-03-25 15:21:32

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3

(resend, using compare_ether_addr_64bits instead of memcmp)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, bonding interface uses the same
mac address for all enslaved devices. Except for mode balance-alb. When you put
this kind of bond device into a bridge it will only add one of mac adresses into
a hash list of mac addresses, say X. This mac address is marked as local. But
this bonding interface also has mac address Y. Now then packet arrives with
destination address Y, this address is not marked as local and the packed looks
like it needs to be forwarded. This packet is then lost which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patch solves the situation in the bonding without touching bridge code,
as Patrick suggested. For every incoming frame to bonding it searches the
destination address in slaves list and if any of slave addresses matches, it
rewrites the address in frame by the adress of bonding master. This ensures that
all frames comming thru the bonding in alb mode have the same address.

Jirka


Signed-off-by: Jiri Pirko <[email protected]>

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..83998f4 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
return 0;
}

+void bond_alb_change_dest(struct sk_buff *skb)
+{
+ struct net_device *bond_dev = skb->dev;
+ struct bonding *bond = netdev_priv(bond_dev);
+ unsigned char *dest = eth_hdr(skb)->h_dest;
+ struct slave *slave;
+ int i;
+
+ if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
+ return;
+ read_lock(&bond->lock);
+ bond_for_each_slave(bond, slave, i) {
+ if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
+ memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+ break;
+ }
+ }
+ read_unlock(&bond->lock);
+}
+
void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
{
if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..77f36fb 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
void bond_alb_monitor(struct work_struct *);
int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+void bond_alb_change_dest(struct sk_buff *skb);
void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
#endif /* __BOND_ALB_H__ */

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3d76686..b62fdc4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4294,6 +4294,19 @@ unwind:
return res;
}

+/*
+ * Called via bond_change_dest_hook.
+ * note: already called with rcu_read_lock (preempt_disabled)
+ */
+void bond_change_dest(struct sk_buff *skb)
+{
+ struct net_device *bond_dev = skb->dev;
+ struct bonding *bond = netdev_priv(bond_dev);
+
+ if (bond->params.mode == BOND_MODE_ALB)
+ bond_alb_change_dest(skb);
+}
+
static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
{
struct bonding *bond = netdev_priv(bond_dev);
@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
register_inetaddr_notifier(&bond_inetaddr_notifier);
bond_register_ipv6_notifier();

+ bond_change_dest_hook = bond_change_dest;
+
goto out;
err:
list_for_each_entry(bond, &bond_dev_list, bond_list) {
@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
unregister_inetaddr_notifier(&bond_inetaddr_notifier);
bond_unregister_ipv6_notifier();

+ bond_change_dest_hook = NULL;
+
bond_destroy_sysfs();

rtnl_lock();
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index ca849d2..df92b70 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
}
#endif

+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
+
#endif /* _LINUX_BONDING_H */

diff --git a/net/core/dev.c b/net/core/dev.c
index e3fe5c7..abe68d9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
+EXPORT_SYMBOL(bond_change_dest_hook);
+#else
+#define bond_change_dest_hook(skb) do {} while (0)
+#endif
+
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;
@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
null_or_orig = NULL;
orig_dev = skb->dev;
if (orig_dev->master) {
- if (skb_bond_should_drop(skb))
+ if (skb_bond_should_drop(skb)) {
null_or_orig = orig_dev; /* deliver only exact match */
- else
+ } else {
skb->dev = orig_dev->master;
+ bond_change_dest_hook(skb);
+ }
}

__get_cpu_var(netdev_rx_stat).total++;

2009-03-25 16:32:15

by Jay Vosburgh

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3

Jiri Pirko <[email protected]> wrote:

>Basically here's what's going on. In every mode, bonding interface uses the same
>mac address for all enslaved devices. Except for mode balance-alb.

I think you mean "only balance-alb will simultaneously use
multiple MAC addresses across different slaves." Yes?

I ask because the active-backup mode with fail_over_mac=active
will change the bond's MAC to always be the MAC of whatever the
currently active slave is, but I don't think that will trigger the
problem you're talking about (because it'll only use one MAC at a time).

>[...] When you put
>this kind of bond device into a bridge it will only add one of mac adresses into
>a hash list of mac addresses, say X. This mac address is marked as local. But
>this bonding interface also has mac address Y. Now then packet arrives with
>destination address Y, this address is not marked as local and the packed looks
>like it needs to be forwarded. This packet is then lost which is wrong.
>
>Notice that interfaces can be added and removed from bond while it is in bridge.
>
>This patch solves the situation in the bonding without touching bridge code,
>as Patrick suggested. For every incoming frame to bonding it searches the
>destination address in slaves list and if any of slave addresses matches, it
>rewrites the address in frame by the adress of bonding master. This ensures that
>all frames comming thru the bonding in alb mode have the same address.
>
>Jirka
>
>
>Signed-off-by: Jiri Pirko <[email protected]>
>
>diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>index 27fb7f5..83998f4 100644
>--- a/drivers/net/bonding/bond_alb.c
>+++ b/drivers/net/bonding/bond_alb.c
>@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
> return 0;
> }
>
>+void bond_alb_change_dest(struct sk_buff *skb)
>+{
>+ struct net_device *bond_dev = skb->dev;
>+ struct bonding *bond = netdev_priv(bond_dev);
>+ unsigned char *dest = eth_hdr(skb)->h_dest;
>+ struct slave *slave;
>+ int i;
>+
>+ if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
>+ return;
>+ read_lock(&bond->lock);
>+ bond_for_each_slave(bond, slave, i) {
>+ if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
>+ memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>+ break;
>+ }
>+ }
>+ read_unlock(&bond->lock);
>+}
>+
> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
> {
> if (bond->alb_info.current_alb_vlan &&
>diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
>index 50968f8..77f36fb 100644
>--- a/drivers/net/bonding/bond_alb.h
>+++ b/drivers/net/bonding/bond_alb.h
>@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
> int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
> void bond_alb_monitor(struct work_struct *);
> int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
>+void bond_alb_change_dest(struct sk_buff *skb);
> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
> #endif /* __BOND_ALB_H__ */
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 3d76686..b62fdc4 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -4294,6 +4294,19 @@ unwind:
> return res;
> }
>
>+/*
>+ * Called via bond_change_dest_hook.
>+ * note: already called with rcu_read_lock (preempt_disabled)
>+ */
>+void bond_change_dest(struct sk_buff *skb)
>+{
>+ struct net_device *bond_dev = skb->dev;
>+ struct bonding *bond = netdev_priv(bond_dev);
>+
>+ if (bond->params.mode == BOND_MODE_ALB)
>+ bond_alb_change_dest(skb);
>+}
>+
> static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
> {
> struct bonding *bond = netdev_priv(bond_dev);
>@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
> register_inetaddr_notifier(&bond_inetaddr_notifier);
> bond_register_ipv6_notifier();
>
>+ bond_change_dest_hook = bond_change_dest;
>+
> goto out;
> err:
> list_for_each_entry(bond, &bond_dev_list, bond_list) {
>@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
> unregister_inetaddr_notifier(&bond_inetaddr_notifier);
> bond_unregister_ipv6_notifier();
>
>+ bond_change_dest_hook = NULL;
>+
> bond_destroy_sysfs();
>
> rtnl_lock();
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index ca849d2..df92b70 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
> }
> #endif
>
>+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
>+
> #endif /* _LINUX_BONDING_H */
>
>diff --git a/net/core/dev.c b/net/core/dev.c
>index e3fe5c7..abe68d9 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
> return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> }
>
>+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
>+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
>+EXPORT_SYMBOL(bond_change_dest_hook);
>+#else
>+#define bond_change_dest_hook(skb) do {} while (0)
>+#endif
>+
> #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
> /* These hooks defined here for ATM */
> struct net_bridge;
>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
> null_or_orig = NULL;
> orig_dev = skb->dev;
> if (orig_dev->master) {
>- if (skb_bond_should_drop(skb))
>+ if (skb_bond_should_drop(skb)) {
> null_or_orig = orig_dev; /* deliver only exact match */
>- else
>+ } else {
> skb->dev = orig_dev->master;
>+ bond_change_dest_hook(skb);

Since you put the hook outside of the skb_bond_should_drop
function, does the VLAN accelerated receive path do the right thing if,
e.g., there's a VLAN on top of bonding and that VLAN is part of the
bridge?

-J

---
-Jay Vosburgh, IBM Linux Technology Center, [email protected]

2009-03-25 17:47:12

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3

Wed, Mar 25, 2009 at 05:31:53PM CET, [email protected] wrote:
>Jiri Pirko <[email protected]> wrote:
>
>>Basically here's what's going on. In every mode, bonding interface uses the same
>>mac address for all enslaved devices. Except for mode balance-alb.
>
> I think you mean "only balance-alb will simultaneously use
>multiple MAC addresses across different slaves." Yes?
Yes I do. I will refolmulate the phrase and repost the patch if you want...
>
> I ask because the active-backup mode with fail_over_mac=active
>will change the bond's MAC to always be the MAC of whatever the
>currently active slave is, but I don't think that will trigger the
>problem you're talking about (because it'll only use one MAC at a time).
>
Yes this fail_over_mac is en exception. In fact I was playing with fail_over_mac
bonding interface in bridge and I have no luck to force a problem with two NICs.
However with 3 NICs I've managed it to the state of 100% packet loss. I'm going
to look at this issue later. This patch is not addressing it...

>>[...] When you put
>>this kind of bond device into a bridge it will only add one of mac adresses into
>>a hash list of mac addresses, say X. This mac address is marked as local. But
>>this bonding interface also has mac address Y. Now then packet arrives with
>>destination address Y, this address is not marked as local and the packed looks
>>like it needs to be forwarded. This packet is then lost which is wrong.
>>
>>Notice that interfaces can be added and removed from bond while it is in bridge.
>>
>>This patch solves the situation in the bonding without touching bridge code,
>>as Patrick suggested. For every incoming frame to bonding it searches the
>>destination address in slaves list and if any of slave addresses matches, it
>>rewrites the address in frame by the adress of bonding master. This ensures that
>>all frames comming thru the bonding in alb mode have the same address.
>>
>>Jirka
>>
>>
>>Signed-off-by: Jiri Pirko <[email protected]>
>>
>>diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>>index 27fb7f5..83998f4 100644
>>--- a/drivers/net/bonding/bond_alb.c
>>+++ b/drivers/net/bonding/bond_alb.c
>>@@ -1762,6 +1762,26 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
>> return 0;
>> }
>>
>>+void bond_alb_change_dest(struct sk_buff *skb)
>>+{
>>+ struct net_device *bond_dev = skb->dev;
>>+ struct bonding *bond = netdev_priv(bond_dev);
>>+ unsigned char *dest = eth_hdr(skb)->h_dest;
>>+ struct slave *slave;
>>+ int i;
>>+
>>+ if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
>>+ return;
>>+ read_lock(&bond->lock);
>>+ bond_for_each_slave(bond, slave, i) {
>>+ if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
>>+ memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
>>+ break;
>>+ }
>>+ }
>>+ read_unlock(&bond->lock);
>>+}
>>+
>> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
>> {
>> if (bond->alb_info.current_alb_vlan &&
>>diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
>>index 50968f8..77f36fb 100644
>>--- a/drivers/net/bonding/bond_alb.h
>>+++ b/drivers/net/bonding/bond_alb.h
>>@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
>> int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
>> void bond_alb_monitor(struct work_struct *);
>> int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
>>+void bond_alb_change_dest(struct sk_buff *skb);
>> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
>> #endif /* __BOND_ALB_H__ */
>>
>>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>>index 3d76686..b62fdc4 100644
>>--- a/drivers/net/bonding/bond_main.c
>>+++ b/drivers/net/bonding/bond_main.c
>>@@ -4294,6 +4294,19 @@ unwind:
>> return res;
>> }
>>
>>+/*
>>+ * Called via bond_change_dest_hook.
>>+ * note: already called with rcu_read_lock (preempt_disabled)
>>+ */
>>+void bond_change_dest(struct sk_buff *skb)
>>+{
>>+ struct net_device *bond_dev = skb->dev;
>>+ struct bonding *bond = netdev_priv(bond_dev);
>>+
>>+ if (bond->params.mode == BOND_MODE_ALB)
>>+ bond_alb_change_dest(skb);
>>+}
>>+
>> static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
>> {
>> struct bonding *bond = netdev_priv(bond_dev);
>>@@ -5243,6 +5256,8 @@ static int __init bonding_init(void)
>> register_inetaddr_notifier(&bond_inetaddr_notifier);
>> bond_register_ipv6_notifier();
>>
>>+ bond_change_dest_hook = bond_change_dest;
>>+
>> goto out;
>> err:
>> list_for_each_entry(bond, &bond_dev_list, bond_list) {
>>@@ -5266,6 +5281,8 @@ static void __exit bonding_exit(void)
>> unregister_inetaddr_notifier(&bond_inetaddr_notifier);
>> bond_unregister_ipv6_notifier();
>>
>>+ bond_change_dest_hook = NULL;
>>+
>> bond_destroy_sysfs();
>>
>> rtnl_lock();
>>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>>index ca849d2..df92b70 100644
>>--- a/drivers/net/bonding/bonding.h
>>+++ b/drivers/net/bonding/bonding.h
>>@@ -375,5 +375,7 @@ static inline void bond_unregister_ipv6_notifier(void)
>> }
>> #endif
>>
>>+extern void (*bond_change_dest_hook)(struct sk_buff *skb);
>>+
>> #endif /* _LINUX_BONDING_H */
>>
>>diff --git a/net/core/dev.c b/net/core/dev.c
>>index e3fe5c7..abe68d9 100644
>>--- a/net/core/dev.c
>>+++ b/net/core/dev.c
>>@@ -2061,6 +2061,13 @@ static inline int deliver_skb(struct sk_buff *skb,
>> return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
>> }
>>
>>+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
>>+void (*bond_change_dest_hook)(struct sk_buff *skb) __read_mostly;
>>+EXPORT_SYMBOL(bond_change_dest_hook);
>>+#else
>>+#define bond_change_dest_hook(skb) do {} while (0)
>>+#endif
>>+
>> #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
>> /* These hooks defined here for ATM */
>> struct net_bridge;
>>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
>> null_or_orig = NULL;
>> orig_dev = skb->dev;
>> if (orig_dev->master) {
>>- if (skb_bond_should_drop(skb))
>>+ if (skb_bond_should_drop(skb)) {
>> null_or_orig = orig_dev; /* deliver only exact match */
>>- else
>>+ } else {
>> skb->dev = orig_dev->master;
>>+ bond_change_dest_hook(skb);
>
> Since you put the hook outside of the skb_bond_should_drop
>function, does the VLAN accelerated receive path do the right thing if,
>e.g., there's a VLAN on top of bonding and that VLAN is part of the
>bridge?
>
> -J
>
>---
> -Jay Vosburgh, IBM Linux Technology Center, [email protected]

2009-03-26 00:24:52

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3

From: Jiri Pirko <[email protected]>
Date: Wed, 25 Mar 2009 18:44:05 +0100

> Wed, Mar 25, 2009 at 05:31:53PM CET, [email protected] wrote:
> >Jiri Pirko <[email protected]> wrote:
> >
> >>Basically here's what's going on. In every mode, bonding interface uses the same
> >>mac address for all enslaved devices. Except for mode balance-alb.
> >
> > I think you mean "only balance-alb will simultaneously use
> >multiple MAC addresses across different slaves." Yes?
> Yes I do. I will refolmulate the phrase and repost the patch if you want...

I'll let you guys discuss this some more.

It looks like we could have some more tweaks before this patch is
finalized.

2009-03-26 00:34:23

by Jay Vosburgh

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3

Jiri Pirko <[email protected]> wrote:

>Wed, Mar 25, 2009 at 05:31:53PM CET, [email protected] wrote:
[...]
>>> #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
>>> /* These hooks defined here for ATM */
>>> struct net_bridge;
>>>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
>>> null_or_orig = NULL;
>>> orig_dev = skb->dev;
>>> if (orig_dev->master) {
>>>- if (skb_bond_should_drop(skb))
>>>+ if (skb_bond_should_drop(skb)) {
>>> null_or_orig = orig_dev; /* deliver only exact match */
>>>- else
>>>+ } else {
>>> skb->dev = orig_dev->master;
>>>+ bond_change_dest_hook(skb);
>>
>> Since you put the hook outside of the skb_bond_should_drop
>>function, does the VLAN accelerated receive path do the right thing if,
>>e.g., there's a VLAN on top of bonding and that VLAN is part of the
>>bridge?

Jiri: not trying to be pushy, but you didn't address the above
question about the VLAN path, and I just want to make sure that you saw
it (it was at the bottom of a long email, so I fear you may not have
seen it).

-J

---
-Jay Vosburgh, IBM Linux Technology Center, [email protected]

2009-03-26 11:15:19

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try3

Wed, Mar 25, 2009 at 05:31:53PM CET, [email protected] wrote:
>>@@ -2251,10 +2258,12 @@ int netif_receive_skb(struct sk_buff *skb)
>> null_or_orig = NULL;
>> orig_dev = skb->dev;
>> if (orig_dev->master) {
>>- if (skb_bond_should_drop(skb))
>>+ if (skb_bond_should_drop(skb)) {
>> null_or_orig = orig_dev; /* deliver only exact match */
>>- else
>>+ } else {
>> skb->dev = orig_dev->master;
>>+ bond_change_dest_hook(skb);
>
> Since you put the hook outside of the skb_bond_should_drop
>function, does the VLAN accelerated receive path do the right thing if,
>e.g., there's a VLAN on top of bonding and that VLAN is part of the
>bridge?

Don't worry :) I did not forget about this - just needed a bit time to
investigate...

Yeah, this look's like a problem. In __vlan_hwaccel_rx there is following line:
skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK);
This rewrites the dev so latter on when netif_receive_skb is called the hook
will be not called (because dev->master will not be set).

Ok I will put the hook inside the skb_bond_should_drop() - it seems like a
correct solution...

Thanks for pointing this out.
>
> -J
>
>---
> -Jay Vosburgh, IBM Linux Technology Center, [email protected]

2009-03-26 15:55:09

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

(resend, updated changelog, hook moved into skb_bond_should_drop,
skb_bond_should_drop ifdefed)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, bonding interface uses the same
mac address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge it will only add one of mac adresses
into a hash list of mac addresses, say X. This mac address is marked as local.
But this bonding interface also has mac address Y. Now then packet arrives with
destination address Y, this address is not marked as local and the packed looks
like it needs to be forwarded. This packet is then lost which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patch solves the situation in the bonding without touching bridge code,
as Patrick suggested. For every incoming frame to bonding it searches the
destination address in slaves list and if any of slave addresses matches, it
rewrites the address in frame by the adress of bonding master. This ensures that
all frames comming thru the bonding in alb mode have the same address.

Jirka


Signed-off-by: Jiri Pirko <[email protected]>

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 27fb7f5..b973ede 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1762,6 +1762,25 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
return 0;
}

+void bond_alb_change_dest(struct sk_buff *skb, struct net_device *bond_dev)
+{
+ struct bonding *bond = netdev_priv(bond_dev);
+ unsigned char *dest = eth_hdr(skb)->h_dest;
+ struct slave *slave;
+ int i;
+
+ if (!compare_ether_addr_64bits(dest, bond_dev->dev_addr))
+ return;
+ read_lock(&bond->lock);
+ bond_for_each_slave(bond, slave, i) {
+ if (!compare_ether_addr_64bits(slave->dev->dev_addr, dest)) {
+ memcpy(dest, bond_dev->dev_addr, ETH_ALEN);
+ break;
+ }
+ }
+ read_unlock(&bond->lock);
+}
+
void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
{
if (bond->alb_info.current_alb_vlan &&
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 50968f8..4924dd7 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -127,6 +127,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev);
void bond_alb_monitor(struct work_struct *);
int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
+void bond_alb_change_dest(struct sk_buff *skb, struct net_device *bond_dev);
void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
#endif /* __BOND_ALB_H__ */

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3d76686..7c7cb81 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4294,6 +4294,18 @@ unwind:
return res;
}

+/*
+ * Called via bond_change_dest_hook.
+ * note: already called with rcu_read_lock (preempt_disabled)
+ */
+void bond_change_dest(struct sk_buff *skb, struct net_device *bond_dev)
+{
+ struct bonding *bond = netdev_priv(bond_dev);
+
+ if (bond->params.mode == BOND_MODE_ALB)
+ bond_alb_change_dest(skb, bond_dev);
+}
+
static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
{
struct bonding *bond = netdev_priv(bond_dev);
@@ -5243,6 +5255,8 @@ static int __init bonding_init(void)
register_inetaddr_notifier(&bond_inetaddr_notifier);
bond_register_ipv6_notifier();

+ bond_change_dest_hook = bond_change_dest;
+
goto out;
err:
list_for_each_entry(bond, &bond_dev_list, bond_list) {
@@ -5266,6 +5280,8 @@ static void __exit bonding_exit(void)
unregister_inetaddr_notifier(&bond_inetaddr_notifier);
bond_unregister_ipv6_notifier();

+ bond_change_dest_hook = NULL;
+
bond_destroy_sysfs();

rtnl_lock();
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index ca849d2..7159483 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -375,5 +375,8 @@ static inline void bond_unregister_ipv6_notifier(void)
}
#endif

+extern void (*bond_change_dest_hook)(struct sk_buff *skb,
+ struct net_device *master);
+
#endif /* _LINUX_BONDING_H */

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6593667..7af6857 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1860,6 +1860,10 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
dev->gso_max_size = size;
}

+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+extern void (*bond_change_dest_hook)(struct sk_buff *skb,
+ struct net_device *master);
+
/* On bonding slaves other than the currently active slave, suppress
* duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
* ARP on active-backup slaves with arp_validate enabled.
@@ -1876,22 +1880,31 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
skb->protocol == __constant_htons(ETH_P_ARP))
- return 0;
+ goto dont_drop;

if (master->priv_flags & IFF_MASTER_ALB) {
if (skb->pkt_type != PACKET_BROADCAST &&
skb->pkt_type != PACKET_MULTICAST)
- return 0;
+ goto dont_drop;
}
if (master->priv_flags & IFF_MASTER_8023AD &&
skb->protocol == __constant_htons(ETH_P_SLOW))
- return 0;
+ goto dont_drop;

return 1;
}
+dont_drop:
+ bond_change_dest_hook(skb, master);
}
+
+ return 0;
+}
+#else
+static inline int skb_bond_should_drop(struct sk_buff *skb)
+{
return 0;
}
+#endif

extern struct pernet_operations __net_initdata loopback_net_ops;
#endif /* __KERNEL__ */
diff --git a/net/core/dev.c b/net/core/dev.c
index e3fe5c7..d9b758b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2061,6 +2061,12 @@ static inline int deliver_skb(struct sk_buff *skb,
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
+void (*bond_change_dest_hook)(struct sk_buff *skb,
+ struct net_device *master) __read_mostly;
+EXPORT_SYMBOL(bond_change_dest_hook);
+#endif
+
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;

2009-03-27 07:38:45

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

From: Jiri Pirko <[email protected]>
Date: Thu, 26 Mar 2009 16:52:06 +0100

> (resend, updated changelog, hook moved into skb_bond_should_drop,
> skb_bond_should_drop ifdefed)
>
> Hi all.
>
> The problem is described in following bugzilla:
> https://bugzilla.redhat.com/show_bug.cgi?id=487763
...
> This patch solves the situation in the bonding without touching bridge code,
> as Patrick suggested. For every incoming frame to bonding it searches the
> destination address in slaves list and if any of slave addresses matches, it
> rewrites the address in frame by the adress of bonding master. This ensures that
> all frames comming thru the bonding in alb mode have the same address.
>
> Signed-off-by: Jiri Pirko <[email protected]>


I don't like the hook, but if that's how it's best done....

Patrick, please review this.

2009-03-27 07:49:43

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

Fri, Mar 27, 2009 at 08:38:19AM CET, [email protected] wrote:
>From: Jiri Pirko <[email protected]>
>Date: Thu, 26 Mar 2009 16:52:06 +0100
>
>> (resend, updated changelog, hook moved into skb_bond_should_drop,
>> skb_bond_should_drop ifdefed)
>>
>> Hi all.
>>
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> ...
>> This patch solves the situation in the bonding without touching bridge code,
>> as Patrick suggested. For every incoming frame to bonding it searches the
>> destination address in slaves list and if any of slave addresses matches, it
>> rewrites the address in frame by the adress of bonding master. This ensures that
>> all frames comming thru the bonding in alb mode have the same address.
>>
>> Signed-off-by: Jiri Pirko <[email protected]>
>
>
>I don't like the hook, but if that's how it's best done....

Yes I agree with you, but I thing that for now it's the best way to do this. I
picked this solution out of 3 that I had in mind and this is the lesser evil :)
If anyone have any other solution please speak up.

>
>Patrick, please review this.

2009-03-27 07:55:22

by Patrick McHardy

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

David Miller wrote:
> From: Jiri Pirko <[email protected]>
> Date: Thu, 26 Mar 2009 16:52:06 +0100
>
>> (resend, updated changelog, hook moved into skb_bond_should_drop,
>> skb_bond_should_drop ifdefed)
>>
>> Hi all.
>>
>> The problem is described in following bugzilla:
>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
> ...
>> This patch solves the situation in the bonding without touching bridge code,
>> as Patrick suggested. For every incoming frame to bonding it searches the
>> destination address in slaves list and if any of slave addresses matches, it
>> rewrites the address in frame by the adress of bonding master. This ensures that
>> all frames comming thru the bonding in alb mode have the same address.
>>
>> Signed-off-by: Jiri Pirko <[email protected]>
>
>
> I don't like the hook, but if that's how it's best done....
>
> Patrick, please review this.

Me neither, but I don't think this approach can be done without the
hook. While I still find it questionable whether this mode really
needs to be supported for a bridge at all, an alternative approach
would be to have bonding add FDB entries for all secondary MACs to
make bridging treat them as local.

2009-03-27 08:43:37

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

Fri, Mar 27, 2009 at 08:53:13AM CET, [email protected] wrote:
> David Miller wrote:
>> From: Jiri Pirko <[email protected]>
>> Date: Thu, 26 Mar 2009 16:52:06 +0100
>>
>>> (resend, updated changelog, hook moved into skb_bond_should_drop,
>>> skb_bond_should_drop ifdefed)
>>>
>>> Hi all.
>>>
>>> The problem is described in following bugzilla:
>>> https://bugzilla.redhat.com/show_bug.cgi?id=487763
>> ...
>>> This patch solves the situation in the bonding without touching bridge code,
>>> as Patrick suggested. For every incoming frame to bonding it searches the
>>> destination address in slaves list and if any of slave addresses matches, it
>>> rewrites the address in frame by the adress of bonding master. This ensures that
>>> all frames comming thru the bonding in alb mode have the same address.
>>>
>>> Signed-off-by: Jiri Pirko <[email protected]>
>>
>>
>> I don't like the hook, but if that's how it's best done....
>>
>> Patrick, please review this.
>
> Me neither, but I don't think this approach can be done without the
> hook. While I still find it questionable whether this mode really
> needs to be supported for a bridge at all

Well there is I think nothing unusual in this net scheme. And by for example
the increasing setups with kvm/bridging it will be needed more and more.

> , an alternative approach
> would be to have bonding add FDB entries for all secondary MACs to
> make bridging treat them as local.

Yes - that is the clear way. But there's not really straihtforward way to do
this. The clear approach would be to extend struct net_device for list of these
mac addresses and let the drivers (binding) fill it and bridge to process it.
But I don't know.

2009-03-27 08:56:12

by Patrick McHardy

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

Jiri Pirko wrote:
> Fri, Mar 27, 2009 at 08:53:13AM CET, [email protected] wrote:
>> >
>> > Me neither, but I don't think this approach can be done without the
>> > hook. While I still find it questionable whether this mode really
>> > needs to be supported for a bridge at all
>
> Well there is I think nothing unusual in this net scheme. And by for example
> the increasing setups with kvm/bridging it will be needed more and more.

Mangling ARP packets for load-balancing purposes seems quite unusual.

>> , an alternative approach
>> would be to have bonding add FDB entries for all secondary MACs to
>> make bridging treat them as local.
>
> Yes - that is the clear way. But there's not really straihtforward way to do
> this. The clear approach would be to extend struct net_device for list of these
> mac addresses and let the drivers (binding) fill it and bridge to process it.
> But I don't know.

We have a list of secondary unicast addresses, but that might not
be suitable in this case since the addresses are (mostly) intended
not to be visible to the stack if I understood correctly.

2009-03-27 09:50:37

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

Fri, Mar 27, 2009 at 09:55:39AM CET, [email protected] wrote:
> Jiri Pirko wrote:
>> Fri, Mar 27, 2009 at 08:53:13AM CET, [email protected] wrote:
>>> >
>>> > Me neither, but I don't think this approach can be done without the
>>> > hook. While I still find it questionable whether this mode really
>>> > needs to be supported for a bridge at all
>>
>> Well there is I think nothing unusual in this net scheme. And by for example
>> the increasing setups with kvm/bridging it will be needed more and more.
>
> Mangling ARP packets for load-balancing purposes seems quite unusual.

Well, there are many unusual things, that do not imply that they should not be
supported...

>>> , an alternative approach
>>> would be to have bonding add FDB entries for all secondary MACs to
>>> make bridging treat them as local.
>>
>> Yes - that is the clear way. But there's not really straihtforward way to do
>> this. The clear approach would be to extend struct net_device for list of these
>> mac addresses and let the drivers (binding) fill it and bridge to process it.
>> But I don't know.
>
> We have a list of secondary unicast addresses, but that might not
> be suitable in this case since the addresses are (mostly) intended
> not to be visible to the stack if I understood correctly.

I agree this list is not suitable for this - it's used for different purpose and
I think it would be not wise to mix it with what we want...

2009-03-29 20:54:26

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

From: Patrick McHardy <[email protected]>
Date: Fri, 27 Mar 2009 08:53:13 +0100

> David Miller wrote:
> > I don't like the hook, but if that's how it's best done....
> > Patrick, please review this.
>
> Me neither, but I don't think this approach can be done without the
> hook. While I still find it questionable whether this mode really
> needs to be supported for a bridge at all, an alternative approach
> would be to have bonding add FDB entries for all secondary MACs to
> make bridging treat them as local.

Do you guys foresee any possibility of an alternative implementation
any time soon?

Otherwise we're just stalling by not putting something into the tree,
and as far as I can tell this patch here might as well be it.

2009-03-30 12:04:42

by Patrick McHardy

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

David Miller wrote:
> From: Patrick McHardy <[email protected]>
> Date: Fri, 27 Mar 2009 08:53:13 +0100
>
>> ... an alternative approach
>> would be to have bonding add FDB entries for all secondary MACs to
>> make bridging treat them as local.
>
> Do you guys foresee any possibility of an alternative implementation
> any time soon?
>
> Otherwise we're just stalling by not putting something into the tree,
> and as far as I can tell this patch here might as well be it.

Adding bridge FDB entries seems like the best fix. It might
need some minor ugliness to avoid new dependencies between
bonding and bridging, but it definitely beats having new hooks
in the core in my opinion.

But I have no idea whether Jiri is actually implementing this.

2009-03-30 12:43:46

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

Mon, Mar 30, 2009 at 02:04:25PM CEST, [email protected] wrote:
> David Miller wrote:
>> From: Patrick McHardy <[email protected]>
>> Date: Fri, 27 Mar 2009 08:53:13 +0100
>>
>>> ... an alternative approach
>>> would be to have bonding add FDB entries for all secondary MACs to
>>> make bridging treat them as local.
>>
>> Do you guys foresee any possibility of an alternative implementation
>> any time soon?
>>
>> Otherwise we're just stalling by not putting something into the tree,
>> and as far as I can tell this patch here might as well be it.
>
> Adding bridge FDB entries seems like the best fix. It might
> need some minor ugliness to avoid new dependencies between
> bonding and bridging, but it definitely beats having new hooks
> in the core in my opinion.

Agree with this.
>
> But I have no idea whether Jiri is actually implementing this.

Currently I'm thinking the way. What I have on mind:
I would like to add a list into struct net_device to contain all mac addresses
of the device. I would also like to use similar interface to handle them as
currently is for uc_list and mc_list. However I do not like that these lists are
not using standard list_head but they are propriate lists only for this purpose.
I'm thinking about converting them to use list_head first. Or maybe ignore them
and do the new list for macs in parallel?

Then we can fill this list with macs in bonding driver and let bridge check it
and make fdb entries.

2009-03-30 12:48:21

by Patrick McHardy

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

Jiri Pirko wrote:
> Currently I'm thinking the way. What I have on mind:
> I would like to add a list into struct net_device to contain all mac addresses
> of the device. I would also like to use similar interface to handle them as
> currently is for uc_list and mc_list. However I do not like that these lists are
> not using standard list_head but they are propriate lists only for this purpose.
> I'm thinking about converting them to use list_head first. Or maybe ignore them
> and do the new list for macs in parallel?

Using list_heads in the address lists would require some pretty large
amount of work since you'd need to convert all the drivers. I'm all
in favour of doing this, but I wouldn't make the fix depend on that
work.

2009-03-30 12:54:51

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

Mon, Mar 30, 2009 at 02:47:59PM CEST, [email protected] wrote:
> Jiri Pirko wrote:
>> Currently I'm thinking the way. What I have on mind:
>> I would like to add a list into struct net_device to contain all mac addresses
>> of the device. I would also like to use similar interface to handle them as
>> currently is for uc_list and mc_list. However I do not like that these lists are
>> not using standard list_head but they are propriate lists only for this purpose.
>> I'm thinking about converting them to use list_head first. Or maybe ignore them
>> and do the new list for macs in parallel?
>
> Using list_heads in the address lists would require some pretty large
> amount of work since you'd need to convert all the drivers.

Yes, I'm aware of it...
> I'm all
> in favour of doing this, but I wouldn't make the fix depend on that
> work.

ok so you are suggesting to use the current list struct?
>

2009-03-30 12:58:29

by Patrick McHardy

[permalink] [raw]
Subject: Re: [PATCH] bonding: allow bond in mode balance-alb to work properly in bridge -try4

Jiri Pirko wrote:
> Mon, Mar 30, 2009 at 02:47:59PM CEST, [email protected] wrote:
>> Jiri Pirko wrote:
>>> Currently I'm thinking the way. What I have on mind:
>>> I would like to add a list into struct net_device to contain all mac addresses
>>> of the device. I would also like to use similar interface to handle them as
>>> currently is for uc_list and mc_list. However I do not like that these lists are
>>> not using standard list_head but they are propriate lists only for this purpose.
>>> I'm thinking about converting them to use list_head first. Or maybe ignore them
>>> and do the new list for macs in parallel?
>> Using list_heads in the address lists would require some pretty large
>> amount of work since you'd need to convert all the drivers.
>
> Yes, I'm aware of it...
>> I'm all
>> in favour of doing this, but I wouldn't make the fix depend on that
>> work.
>
> ok so you are suggesting to use the current list struct?

Whatever will make this easier :) You could of course already add the
new structure and use it for your new list and do the conversion of
the existing structures on top of that.

2009-04-13 08:39:24

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 0/4] bonding: allow bond in mode balance-alb to work properly in bridge -try5

(resend, updated changelog, completely reworked)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, bonding interface uses the same
mac address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge it will only add one of mac adresses
into a hash list of mac addresses, say X. This mac address is marked as local.
But this bonding interface also has mac address Y. Now then packet arrives with
destination address Y, this address is not marked as local and the packed looks
like it needs to be forwarded. This packet is then lost which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patchset solves this issue in the best way it can be possibly solved. By
adding all mac addresses of all slave devices to the bridge hash list. To carry
these addresses the new list has to be introduced in struct net_device.

Jirka

2009-04-13 08:40:33

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 1/4] net: introduce dev_mac_address_changed

Introducing function dev_mac_address_changed which can be called from driver
which changed his mac address to force notifiers to be called.

Signed-off-by: Jiri Pirko <[email protected]>
---
include/linux/netdevice.h | 1 +
net/core/dev.c | 12 ++++++++++++
2 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..ff8db51 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1461,6 +1461,7 @@ extern int dev_change_net_namespace(struct net_device *,
extern int dev_set_mtu(struct net_device *, int);
extern int dev_set_mac_address(struct net_device *,
struct sockaddr *);
+extern void dev_mac_address_changed(struct net_device *);
extern int dev_hard_start_xmit(struct sk_buff *skb,
struct net_device *dev,
struct netdev_queue *txq);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..1adc89b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3833,6 +3833,18 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
return err;
}

+/**
+ * dev_mac_address_changed - Notify Media Access Control Address changed
+ * @dev: device
+ *
+ * Notifies the change of the hardware (MAC) address of the device
+ */
+void dev_mac_address_changed(struct net_device *dev)
+{
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+}
+EXPORT_SYMBOL(dev_mac_address_changed);
+
/*
* Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
*/
--
1.6.0.6

2009-04-13 08:43:47

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 2/4] net: introduce a list of device addresses dev_addr_list

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with device address list. The list is a replacement
for the original dev_addr field and because in some situations there is need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
sees no difference.

Signed-off-by: Jiri Pirko <[email protected]>
---
include/linux/netdevice.h | 51 +++++++++-
net/core/dev.c | 264 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 313 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ff8db51..8cf62f1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,12 @@ struct dev_addr_list
#define dmi_users da_users
#define dmi_gusers da_gusers

+struct hw_addr {
+ struct list_head list;
+ unsigned char addr[MAX_ADDR_LEN];
+ int refcount;
+};
+
struct hh_cache
{
struct hh_cache *hh_next; /* Next entry */
@@ -776,8 +782,12 @@ struct net_device
*/
unsigned long last_rx; /* Time of last Rx */
/* Interface address info used in eth_type_trans() */
- unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
- because most packets are unicast) */
+ unsigned char *dev_addr; /* hw address, (before bcast
+ because most packets are
+ unicast) */
+
+ struct list_head dev_addr_list; /* list of device hw addresses */
+ spinlock_t dev_addr_list_lock;

unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */

@@ -1779,6 +1789,32 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
spin_unlock_bh(&dev->addr_list_lock);
}

+/* Locking helpers for spinlock guarding dev_addr_list */
+
+static inline void netif_dev_addr_lock(struct net_device *dev)
+{
+ spin_lock(&dev->dev_addr_list_lock);
+}
+
+static inline void netif_dev_addr_lock_bh(struct net_device *dev)
+{
+ spin_lock_bh(&dev->dev_addr_list_lock);
+}
+
+static inline void netif_dev_addr_unlock(struct net_device *dev)
+{
+ spin_unlock(&dev->dev_addr_list_lock);
+}
+
+static inline void netif_dev_addr_unlock_bh(struct net_device *dev)
+{
+ spin_unlock_bh(&dev->dev_addr_list_lock);
+}
+
+/* dev_addr_list walker */
+#define for_each_dev_addr(dev, ha) \
+ list_for_each_entry(ha, &dev->dev_addr_list, list)
+
/* These functions live elsewhere (drivers/net/net_init.c, but related) */

extern void ether_setup(struct net_device *dev);
@@ -1791,6 +1827,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
alloc_netdev_mq(sizeof_priv, name, setup, 1)
extern int register_netdev(struct net_device *dev);
extern void unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int dev_addr_add(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_del(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+extern int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+
/* Functions used for secondary unicast and multicast support */
extern void dev_set_rx_mode(struct net_device *dev);
extern void __dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1adc89b..0b154b3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,263 @@ void dev_set_rx_mode(struct net_device *dev)
netif_addr_unlock_bh(dev);
}

+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct hw_addr *ha;
+ int i = 0;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ ha->refcount++;
+ return 0;
+ }
+ }
+
+ ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->refcount = 1;
+ list_add_tail(&ha->list, list);
+ return 0;
+}
+
+static inline int __hw_addr_add(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct hw_addr *ha;
+ int i = 0;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ if (--ha->refcount)
+ return 0;
+ list_del(&ha->list);
+ kfree(ha);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static inline int __hw_addr_del(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ int err;
+ struct hw_addr *ha, *ha2;
+
+ list_for_each_entry(ha, from_list, list) {
+ err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+ if (err)
+ goto unroll;
+ }
+ return 0;
+unroll:
+ list_for_each_entry(ha2, from_list, list) {
+ if (ha2 == ha)
+ break;
+ __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+ }
+ return err;
+}
+
+static inline int __hw_addr_add_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ struct hw_addr *ha;
+
+ list_for_each_entry(ha, from_list, list) {
+ __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+ }
+}
+
+static inline void __hw_addr_del_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+ struct hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, list, list) {
+ list_del(&ha->list);
+ }
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+ netif_dev_addr_lock_bh(dev);
+ __hw_addr_flush(&dev->dev_addr_list);
+ dev->dev_addr = NULL;
+ netif_dev_addr_unlock_bh(dev);
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct hw_addr *ha;
+ int err;
+
+ spin_lock_init(&dev->dev_addr_list_lock);
+ INIT_LIST_HEAD(&dev->dev_addr_list);
+ memset(addr, 0, sizeof(*addr));
+ netif_dev_addr_lock_bh(dev);
+ err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ ha = list_first_entry(&dev->dev_addr_list,
+ struct hw_addr, list);
+ dev->dev_addr = ha->addr;
+ }
+ netif_dev_addr_unlock_bh(dev);
+ return err;
+}
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ netif_dev_addr_lock_bh(dev);
+ err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ netif_dev_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ netif_dev_addr_lock_bh(dev);
+ err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ netif_dev_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ * dev_addr_add_multiple - Add device addresses from another device
+ * @to_dev: device to which addresses will be added
+ * @from_dev: device from which addresses will be added
+ *
+ * Add device addresses of the one device to another.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+
+ netif_dev_addr_lock_bh(from_dev);
+ netif_dev_addr_lock_bh(to_dev);
+ err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ netif_dev_addr_unlock_bh(to_dev);
+ netif_dev_addr_unlock_bh(from_dev);
+
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ * dev_addr_del_multiple - Delete device addresses by another device
+ * @to_dev: device where the addresses will be deleted
+ * @from_dev: device by which addresses the addresses will be deleted
+ *
+ * Deletes addresses in to device by the list of addresses in from device.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+
+ netif_dev_addr_lock_bh(from_dev);
+ netif_dev_addr_lock_bh(to_dev);
+ __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ netif_dev_addr_unlock_bh(to_dev);
+ netif_dev_addr_unlock_bh(from_dev);
+
+ return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
int __dev_addr_delete(struct dev_addr_list **list, int *count,
void *addr, int alen, int glbl)
{
@@ -4269,6 +4526,9 @@ static void rollback_registered(struct net_device *dev)
*/
dev_addr_discard(dev);

+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);

@@ -4791,6 +5051,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,

dev->gso_max_size = GSO_MAX_SIZE;

+ dev_addr_init(dev);
netdev_init_queues(dev);

INIT_LIST_HEAD(&dev->napi_list);
@@ -4977,6 +5238,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
*/
dev_addr_discard(dev);

+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
netdev_unregister_kobject(dev);

/* Actually switch the network namespace */
--
1.6.0.6

2009-04-13 08:46:52

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 3/4] net: bridge: use device address list instead of dev_addr

This patch changes the handling of mac addresses of bridge port devices. Now
it uses previously introduced list of device addresses. It allows the bridge to
know more then one local mac address per port which is mandatory for the right
work in some cases.

Signed-off-by: Jiri Pirko <[email protected]>
---
net/bridge/br_fdb.c | 120 +++++++++++++++++++++++++++++++++--------------
net/bridge/br_if.c | 2 +-
net/bridge/br_notify.c | 2 +-
net/bridge/br_private.h | 4 +-
4 files changed, 89 insertions(+), 39 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a48f5ef..6efc556 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -77,10 +77,45 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
br_fdb_put(f);
}

-void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
+/*
+ * Finds out if passed address is one of the addresses assigned to the device.
+ * Returns 1 on positive result
+ */
+static inline int is_dev_addr(struct net_device *dev, unsigned char *addr)
+{
+ struct hw_addr *ha;
+ int ret = 1;
+
+ netif_dev_addr_lock_bh(dev);
+ for_each_dev_addr(dev, ha) {
+ ret = compare_ether_addr(addr, ha->addr);
+ if (!ret)
+ break;
+ }
+ netif_dev_addr_unlock_bh(dev);
+ return !ret ? 1 : 0;
+}
+
+static int another_port_has_addr(const struct net_bridge_port *p,
+ struct net_bridge_fdb_entry *f)
+{
+ struct net_bridge *br = p->br;
+ struct net_bridge_port *op;
+
+ list_for_each_entry(op, &br->port_list, list) {
+ if (op != p && is_dev_addr(op->dev, f->addr.addr)) {
+ f->dst = op;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
{
struct net_bridge *br = p->br;
int i;
+ struct hw_addr *ha;

spin_lock_bh(&br->hash_lock);

@@ -92,26 +127,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)

f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
if (f->dst == p && f->is_local) {
- /* maybe another port has same hw addr? */
- struct net_bridge_port *op;
- list_for_each_entry(op, &br->port_list, list) {
- if (op != p &&
- !compare_ether_addr(op->dev->dev_addr,
- f->addr.addr)) {
- f->dst = op;
- goto insert;
- }
- }
-
- /* delete old one */
- fdb_delete(f);
- goto insert;
+ /*
+ * maybe another port has same hw addr?,
+ * if not then delete it
+ */
+ if (!another_port_has_addr(p, f))
+ fdb_delete(f);
}
}
}
- insert:
- /* insert new address, may fail if invalid address or dup. */
- fdb_insert(br, p, newaddr);
+
+ /* insert device addresses, may fail if invalid address. */
+
+ netif_dev_addr_lock_bh(dev);
+ for_each_dev_addr(dev, ha) {
+ fdb_insert(br, p, ha->addr);
+ }
+ netif_dev_addr_unlock_bh(dev);

spin_unlock_bh(&br->hash_lock);
}
@@ -189,20 +221,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
* then when one port is deleted, assign
* the local entry to other port
*/
- if (f->is_local) {
- struct net_bridge_port *op;
- list_for_each_entry(op, &br->port_list, list) {
- if (op != p &&
- !compare_ether_addr(op->dev->dev_addr,
- f->addr.addr)) {
- f->dst = op;
- goto skip_delete;
- }
- }
- }
-
- fdb_delete(f);
- skip_delete: ;
+ if (!f->is_local ||
+ !another_port_has_addr(p, f))
+ fdb_delete(f);
}
}
spin_unlock_bh(&br->hash_lock);
@@ -338,7 +359,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
}

static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr)
+ const unsigned char *addr)
{
struct hlist_head *head = &br->hash[br_mac_hash(addr)];
struct net_bridge_fdb_entry *fdb;
@@ -366,13 +387,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
return 0;
}

+static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
+ struct net_device *dev)
+{
+ struct hw_addr *ha, *ha2;
+ struct net_bridge_fdb_entry *fdb;
+ struct hlist_head *head;
+ int ret = 0;
+
+ netif_dev_addr_lock_bh(dev);
+ for_each_dev_addr(dev, ha) {
+ ret = fdb_insert(br, source, ha->addr);
+ if (ret)
+ goto unroll;
+ }
+ goto unlock;
+unroll:
+ for_each_dev_addr(dev, ha2) {
+ if (ha2 == ha)
+ break;
+ head = &br->hash[br_mac_hash(ha2->addr)];
+ fdb = fdb_find(head, ha2->addr);
+ if (fdb && fdb->is_local)
+ fdb_delete(fdb);
+ }
+unlock:
+ netif_dev_addr_unlock_bh(dev);
+ return ret;
+}
+
int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr)
+ struct net_device *dev)
{
int ret;

spin_lock_bh(&br->hash_lock);
- ret = fdb_insert(br, source, addr);
+ ret = fdb_insert_dev(br, source, dev);
spin_unlock_bh(&br->hash_lock);
return ret;
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8a96672..789cb30 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
if (err)
goto err0;

- err = br_fdb_insert(br, p, dev->dev_addr);
+ err = br_fdb_insert(br, p, dev);
if (err)
goto err1;

diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 763a3ec..1423541 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v

case NETDEV_CHANGEADDR:
spin_lock_bh(&br->lock);
- br_fdb_changeaddr(p, dev->dev_addr);
+ br_fdb_changeaddr(p, dev);
br_stp_recalculate_bridge_id(br);
spin_unlock_bh(&br->lock);
break;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b6c3b71..65ffe3d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -148,7 +148,7 @@ extern int br_fdb_init(void);
extern void br_fdb_fini(void);
extern void br_fdb_flush(struct net_bridge *br);
extern void br_fdb_changeaddr(struct net_bridge_port *p,
- const unsigned char *newaddr);
+ struct net_device *dev);
extern void br_fdb_cleanup(unsigned long arg);
extern void br_fdb_delete_by_port(struct net_bridge *br,
const struct net_bridge_port *p, int do_all);
@@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
unsigned long count, unsigned long off);
extern int br_fdb_insert(struct net_bridge *br,
struct net_bridge_port *source,
- const unsigned char *addr);
+ struct net_device *dev);
extern void br_fdb_update(struct net_bridge *br,
struct net_bridge_port *source,
const unsigned char *addr);
--
1.6.0.6

2009-04-13 08:48:57

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 4/4] net: bonding: add slave device addresses in mode alb

When in mode alb, add all device addresses which belong to an enslaved slave
device to the bond device. This ensures that all mac addresses will be
treated as local and bonding in this mode will work fine in bridge.

Signed-off-by: Jiri Pirko <[email protected]>
---
drivers/net/bonding/bond_main.c | 30 +++++++++++++++++++++++++++++-
1 files changed, 29 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 99610f3..47795c7 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1385,6 +1385,11 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
bond->setup_by_slave = 1;
}

+static inline int should_copy_dev_addrs(struct bonding *bond)
+{
+ return bond->params.mode == BOND_MODE_ALB ? 1 : 0;
+}
+
/* enslave device <slave> to bond device <master> */
int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
{
@@ -1510,6 +1515,13 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
*/
new_slave->original_flags = slave_dev->flags;

+ if (should_copy_dev_addrs(bond)) {
+ res = dev_addr_add_multiple(bond_dev, slave_dev);
+ if (res)
+ goto err_free;
+ dev_mac_address_changed(bond_dev);
+ }
+
/*
* Save slave's original ("permanent") mac address for modes
* that need it, and for restoring it upon release, and then
@@ -1527,7 +1539,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
res = dev_set_mac_address(slave_dev, &addr);
if (res) {
pr_debug("Error %d calling set_mac_address\n", res);
- goto err_free;
+ goto err_remove_dev_addrs;
}
}

@@ -1769,6 +1781,12 @@ err_restore_mac:
dev_set_mac_address(slave_dev, &addr);
}

+err_remove_dev_addrs:
+ if (should_copy_dev_addrs(bond)) {
+ dev_addr_del_multiple(bond_dev, slave_dev);
+ dev_mac_address_changed(bond_dev);
+ }
+
err_free:
kfree(new_slave);

@@ -1954,6 +1972,11 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
/* close slave before restoring its mac address */
dev_close(slave_dev);

+ if (should_copy_dev_addrs(bond)) {
+ dev_addr_del_multiple(bond_dev, slave_dev);
+ dev_mac_address_changed(bond_dev);
+ }
+
if (bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
/* restore original ("permanent") mac address */
memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
@@ -2090,6 +2113,9 @@ static int bond_release_all(struct net_device *bond_dev)
/* close slave before restoring its mac address */
dev_close(slave_dev);

+ if (should_copy_dev_addrs(bond))
+ dev_addr_del_multiple(bond_dev, slave_dev);
+
if (!bond->params.fail_over_mac) {
/* restore original ("permanent") mac address*/
memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
@@ -2106,6 +2132,8 @@ static int bond_release_all(struct net_device *bond_dev)
write_lock_bh(&bond->lock);
}

+ dev_mac_address_changed(bond_dev);
+
/* zero the mac address of the master so it will be
* set by the application to the mac address of the
* first slave
--
1.6.0.6

2009-04-13 14:51:31

by Stephen Hemminger

[permalink] [raw]
Subject: Re: [PATCH 2/4] net: introduce a list of device addresses dev_addr_list

On Mon, 13 Apr 2009 10:42:02 +0200
Jiri Pirko <[email protected]> wrote:

> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
>
> Signed-off-by: Jiri Pirko <[email protected]>
> ---
> include/linux/netdevice.h | 51 +++++++++-
> net/core/dev.c | 264 +++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 313 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ff8db51..8cf62f1 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,12 @@ struct dev_addr_list
> #define dmi_users da_users
> #define dmi_gusers da_gusers
>
> +struct hw_addr {
> + struct list_head list;
> + unsigned char addr[MAX_ADDR_LEN];
> + int refcount;
> +};
> +
> struct hh_cache
> {
> struct hh_cache *hh_next; /* Next entry */
> @@ -776,8 +782,12 @@ struct net_device
> */
> unsigned long last_rx; /* Time of last Rx */
> /* Interface address info used in eth_type_trans() */
> - unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
> - because most packets are unicast) */
> + unsigned char *dev_addr; /* hw address, (before bcast
> + because most packets are
> + unicast) */
> +
> + struct list_head dev_addr_list; /* list of device hw addresses */
> + spinlock_t dev_addr_list_lock;
>
> unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
>
> @@ -1779,6 +1789,32 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
> spin_unlock_bh(&dev->addr_list_lock);
> }
>
> +/* Locking helpers for spinlock guarding dev_addr_list */
> +
> +static inline void netif_dev_addr_lock(struct net_device *dev)
> +{
> + spin_lock(&dev->dev_addr_list_lock);
> +}
> +
> +static inline void netif_dev_addr_lock_bh(struct net_device *dev)
> +{
> + spin_lock_bh(&dev->dev_addr_list_lock);
> +}
> +
> +static inline void netif_dev_addr_unlock(struct net_device *dev)
> +{
> + spin_unlock(&dev->dev_addr_list_lock);
> +}
> +
> +static inline void netif_dev_addr_unlock_bh(struct net_device *dev)
> +{
> + spin_unlock_bh(&dev->dev_addr_list_lock);
> +}
> +

This lock is unnecessary, use RCU list for read.
Since all changes are under RTNL mutex, there is no chance
for conflict on update.

2009-04-13 14:54:24

by Stephen Hemminger

[permalink] [raw]
Subject: Re: [PATCH 3/4] net: bridge: use device address list instead of dev_addr

On Mon, 13 Apr 2009 10:44:08 +0200
Jiri Pirko <[email protected]> wrote:

> This patch changes the handling of mac addresses of bridge port devices. Now
> it uses previously introduced list of device addresses. It allows the bridge to
> know more then one local mac address per port which is mandatory for the right
> work in some cases.
>
> Signed-off-by: Jiri Pirko <[email protected]>
> ---
> net/bridge/br_fdb.c | 120 +++++++++++++++++++++++++++++++++--------------
> net/bridge/br_if.c | 2 +-
> net/bridge/br_notify.c | 2 +-
> net/bridge/br_private.h | 4 +-
> 4 files changed, 89 insertions(+), 39 deletions(-)
>
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index a48f5ef..6efc556 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -77,10 +77,45 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
> br_fdb_put(f);
> }
>
> -void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
> +/*
> + * Finds out if passed address is one of the addresses assigned to the device.
> + * Returns 1 on positive result
> + */
> +static inline int is_dev_addr(struct net_device *dev, unsigned char *addr)

Why not a general version in net_device.h or etherdevice.h?

static inline bool is_etherdev_addr(const struct net_device *dev, const unsigned char addr[ETH_ALEN])

> +{
> + struct hw_addr *ha;
> + int ret = 1;
> +
> + netif_dev_addr_lock_bh(dev);
> + for_each_dev_addr(dev, ha) {
User RCU

> + ret = compare_ether_addr(addr, ha->addr);
> + if (!ret)
> + break;
> + }
> + netif_dev_addr_unlock_bh(dev);
> + return !ret ? 1 : 0;
> +}
> +
> +static int another_port_has_addr(const struct net_bridge_port *p,
> + struct net_bridge_fdb_entry *f)
> +{
> + struct net_bridge *br = p->br;
> + struct net_bridge_port *op;
> +
> + list_for_each_entry(op, &br->port_list, list) {
> + if (op != p && is_dev_addr(op->dev, f->addr.addr)) {
> + f->dst = op;
> + return 1;
> + }
> + }
> + return 0;
> +}

Forwarding database is hot path, people sometimes run lots of devices
on single bridge, doesn't this scale worse?

> +void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
> {
> struct net_bridge *br = p->br;
> int i;
> + struct hw_addr *ha;
>
> spin_lock_bh(&br->hash_lock);
>
> @@ -92,26 +127,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
>
> f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
> if (f->dst == p && f->is_local) {
> - /* maybe another port has same hw addr? */
> - struct net_bridge_port *op;
> - list_for_each_entry(op, &br->port_list, list) {
> - if (op != p &&
> - !compare_ether_addr(op->dev->dev_addr,
> - f->addr.addr)) {
> - f->dst = op;
> - goto insert;
> - }
> - }
> -
> - /* delete old one */
> - fdb_delete(f);
> - goto insert;
> + /*
> + * maybe another port has same hw addr?,
> + * if not then delete it
> + */
> + if (!another_port_has_addr(p, f))
> + fdb_delete(f);
> }
> }
> }
> - insert:
> - /* insert new address, may fail if invalid address or dup. */
> - fdb_insert(br, p, newaddr);
> +
> + /* insert device addresses, may fail if invalid address. */
> +
> + netif_dev_addr_lock_bh(dev);
> + for_each_dev_addr(dev, ha) {
> + fdb_insert(br, p, ha->addr);
> + }
> + netif_dev_addr_unlock_bh(dev);
>

You added another layer of locking on the already hot bridge
fast path.

2009-04-13 14:57:04

by Stephen Hemminger

[permalink] [raw]
Subject: Re: [PATCH 4/4] net: bonding: add slave device addresses in mode alb

On Mon, 13 Apr 2009 10:46:15 +0200
Jiri Pirko <[email protected]> wrote:

> When in mode alb, add all device addresses which belong to an enslaved slave
> device to the bond device. This ensures that all mac addresses will be
> treated as local and bonding in this mode will work fine in bridge.
>
> Signed-off-by: Jiri Pirko <[email protected]>
> ---
> drivers/net/bonding/bond_main.c | 30 +++++++++++++++++++++++++++++-
> 1 files changed, 29 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 99610f3..47795c7 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -1385,6 +1385,11 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
> bond->setup_by_slave = 1;
> }
>
> +static inline int should_copy_dev_addrs(struct bonding *bond)
> +{
> + return bond->params.mode == BOND_MODE_ALB ? 1 : 0;
> +}

static inline bool should_copy_dev_addrs(const struct bonding *bond)
{
return (bond->params.mode == BOND_MODE_ALB);
}

Three things are wrong with your style here:
1. Needless use of tri-graph operator, just return the result
2. Use const for test_foo() type functions
3. Use bool to make it clearer what the result is.

> /* enslave device <slave> to bond device <master> */
> int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
> {
> @@ -1510,6 +1515,13 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
> */
> new_slave->original_flags = slave_dev->flags;
>
> + if (should_copy_dev_addrs(bond)) {
> + res = dev_addr_add_multiple(bond_dev, slave_dev);
> + if (res)
> + goto err_free;
> + dev_mac_address_changed(bond_dev);

The notifier (dev_mac_address_changed) should be part of dev_addr_add

2009-04-13 15:00:03

by Stephen Hemminger

[permalink] [raw]
Subject: Re: [PATCH 1/4] net: introduce dev_mac_address_changed

On Mon, 13 Apr 2009 10:38:48 +0200
Jiri Pirko <[email protected]> wrote:

> Introducing function dev_mac_address_changed which can be called from driver
> which changed his mac address to force notifiers to be called.
>
> Signed-off-by: Jiri Pirko <[email protected]>
> ---
> include/linux/netdevice.h | 1 +
> net/core/dev.c | 12 ++++++++++++
> 2 files changed, 13 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..ff8db51 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1461,6 +1461,7 @@ extern int dev_change_net_namespace(struct net_device *,
> extern int dev_set_mtu(struct net_device *, int);
> extern int dev_set_mac_address(struct net_device *,
> struct sockaddr *);
> +extern void dev_mac_address_changed(struct net_device *);
> extern int dev_hard_start_xmit(struct sk_buff *skb,
> struct net_device *dev,
> struct netdev_queue *txq);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 91d792d..1adc89b 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3833,6 +3833,18 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
> return err;
> }
>
> +/**
> + * dev_mac_address_changed - Notify Media Access Control Address changed
> + * @dev: device
> + *
> + * Notifies the change of the hardware (MAC) address of the device
> + */
> +void dev_mac_address_changed(struct net_device *dev)
> +{
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +}
> +EXPORT_SYMBOL(dev_mac_address_changed);
> +
> /*
> * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
> */

The original version of this that I send, allowed notifiers to return
an error to block changing address (error would go back to application).
This is how other notifier hooks work (mtu, etc).

Why is dev_set_mac_address_changed called out separately, it should
be inside dev_set_mac_address.

2009-04-13 22:53:57

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 2/4] net: introduce a list of device addresses dev_addr_list

From: Jiri Pirko <[email protected]>
Date: Mon, 13 Apr 2009 10:42:02 +0200

> @@ -210,6 +210,12 @@ struct dev_addr_list
> #define dmi_users da_users
> #define dmi_gusers da_gusers
>
> +struct hw_addr {
> + struct list_head list;
> + unsigned char addr[MAX_ADDR_LEN];
> + int refcount;
> +};
> +

Please don't pollute the global namespace with a structure name
like this. Use "netdev_hw_addr" or "net_hw_addr".

> +static inline int __hw_addr_add(struct list_head *list, unsigned char *addr,
> + int addr_len)

Please let the compiler inline things as it sees fit. These
aren't routines in some header file or anything like that.

2009-04-13 22:54:30

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 3/4] net: bridge: use device address list instead of dev_addr

From: Jiri Pirko <[email protected]>
Date: Mon, 13 Apr 2009 10:44:08 +0200

> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index a48f5ef..6efc556 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
...
> +static inline int is_dev_addr(struct net_device *dev, unsigned char *addr)
> +{

Please drop the inline, let the compiler work it out.

2009-04-13 22:54:58

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 2/4] net: introduce a list of device addresses dev_addr_list

From: Stephen Hemminger <[email protected]>
Date: Mon, 13 Apr 2009 07:49:17 -0700

> This lock is unnecessary, use RCU list for read.
> Since all changes are under RTNL mutex, there is no chance
> for conflict on update.

Agreed.

2009-04-14 10:17:24

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 3/4] net: bridge: use device address list instead of dev_addr

Mon, Apr 13, 2009 at 04:54:00PM CEST, [email protected] wrote:
>> +static int another_port_has_addr(const struct net_bridge_port *p,
>> + struct net_bridge_fdb_entry *f)
>> +{
>> + struct net_bridge *br = p->br;
>> + struct net_bridge_port *op;
>> +
>> + list_for_each_entry(op, &br->port_list, list) {
>> + if (op != p && is_dev_addr(op->dev, f->addr.addr)) {
>> + f->dst = op;
>> + return 1;
>> + }
>> + }
>> + return 0;
>> +}
>
>Forwarding database is hot path, people sometimes run lots of devices
>on single bridge, doesn't this scale worse?
>
This only puts the original loop code to the function, so if compiler decides to
inline this there might be no difference.

2009-04-15 08:19:15

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 0/3] bonding: allow bond in mode balance-alb to work properly in bridge -try6

(resend, rcu list locking, cometics)

Hi all.

The problem is described in following bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=487763

Basically here's what's going on. In every mode, bonding interface uses the same
mac address for all enslaved devices (except fail_over_mac). Only balance-alb
will simultaneously use multiple MAC addresses across different slaves. When you
put this kind of bond device into a bridge it will only add one of mac adresses
into a hash list of mac addresses, say X. This mac address is marked as local.
But this bonding interface also has mac address Y. Now then packet arrives with
destination address Y, this address is not marked as local and the packed looks
like it needs to be forwarded. This packet is then lost which is wrong.

Notice that interfaces can be added and removed from bond while it is in bridge.

This patchset solves this issue in the best way it can be possibly solved. By
adding all mac addresses of all slave devices to the bridge hash list. To carry
these addresses the new list has to be introduced in struct net_device.

Jirka

2009-04-15 08:20:08

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with device address list. The list is a replacement
for the original dev_addr field and because in some situations there is need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
sees no difference.

Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.

Signed-off-by: Jiri Pirko <[email protected]>
---
include/linux/etherdevice.h | 24 ++++
include/linux/netdevice.h | 31 +++++-
net/core/dev.c | 262 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 315 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..348a75e 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
(a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
}

+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address if one of the device addresses.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+ const u8 *addr)
+{
+ struct netdev_hw_addr *ha;
+ int res = 1;
+
+ rcu_read_lock();
+ for_each_dev_addr(dev, ha) {
+ res = compare_ether_addr(addr, ha->addr);
+ if (!res)
+ break;
+ }
+ rcu_read_unlock();
+ return !res;
+}
+
#endif /* _LINUX_ETHERDEVICE_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..77abfdf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,12 @@ struct dev_addr_list
#define dmi_users da_users
#define dmi_gusers da_gusers

+struct netdev_hw_addr {
+ struct list_head list;
+ unsigned char addr[MAX_ADDR_LEN];
+ int refcount;
+};
+
struct hh_cache
{
struct hh_cache *hh_next; /* Next entry */
@@ -776,8 +782,11 @@ struct net_device
*/
unsigned long last_rx; /* Time of last Rx */
/* Interface address info used in eth_type_trans() */
- unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
- because most packets are unicast) */
+ unsigned char *dev_addr; /* hw address, (before bcast
+ because most packets are
+ unicast) */
+
+ struct list_head dev_addr_list; /* list of device hw addresses */

unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */

@@ -1778,6 +1787,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
spin_unlock_bh(&dev->addr_list_lock);
}

+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+ list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
/* These functions live elsewhere (drivers/net/net_init.c, but related) */

extern void ether_setup(struct net_device *dev);
@@ -1790,6 +1806,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
alloc_netdev_mq(sizeof_priv, name, setup, 1)
extern int register_netdev(struct net_device *dev);
extern void unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int dev_addr_add(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_del(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+extern int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+
/* Functions used for secondary unicast and multicast support */
extern void dev_set_rx_mode(struct net_device *dev);
extern void __dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..f77b5e6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,261 @@ void dev_set_rx_mode(struct net_device *dev)
netif_addr_unlock_bh(dev);
}

+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ ha->refcount++;
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->refcount = 1;
+ list_add_tail_rcu(&ha->list, list);
+ return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ if (--ha->refcount)
+ return 0;
+ list_del_rcu(&ha->list);
+ synchronize_rcu();
+ kfree(ha);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *ha2;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ha, from_list, list) {
+ err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+ if (err)
+ goto unroll;
+ }
+ goto unlock;
+unroll:
+ list_for_each_entry_rcu(ha2, from_list, list) {
+ if (ha2 == ha)
+ break;
+ __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+ }
+unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ha, from_list, list) {
+ __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+ }
+ rcu_read_unlock();
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, list, list) {
+ list_del_rcu(&ha->list);
+ synchronize_rcu();
+ kfree(ha);
+ }
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+ __hw_addr_flush(&dev->dev_addr_list);
+ dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ INIT_LIST_HEAD(&dev->dev_addr_list);
+ memset(addr, 0, sizeof(*addr));
+ err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ rcu_read_lock();
+ ha = list_first_entry_rcu(&dev->dev_addr_list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ rcu_read_unlock();
+ }
+ return err;
+}
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ * dev_addr_add_multiple - Add device addresses from another device
+ * @to_dev: device to which addresses will be added
+ * @from_dev: device from which addresses will be added
+ *
+ * Add device addresses of the one device to another.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ * dev_addr_del_multiple - Delete device addresses by another device
+ * @to_dev: device where the addresses will be deleted
+ * @from_dev: device by which addresses the addresses will be deleted
+ *
+ * Deletes addresses in to device by the list of addresses in from device.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
int __dev_addr_delete(struct dev_addr_list **list, int *count,
void *addr, int alen, int glbl)
{
@@ -4257,6 +4512,9 @@ static void rollback_registered(struct net_device *dev)
*/
dev_addr_discard(dev);

+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);

@@ -4779,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,

dev->gso_max_size = GSO_MAX_SIZE;

+ dev_addr_init(dev);
netdev_init_queues(dev);

INIT_LIST_HEAD(&dev->napi_list);
@@ -4965,6 +5224,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
*/
dev_addr_discard(dev);

+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
netdev_unregister_kobject(dev);

/* Actually switch the network namespace */
--
1.6.0.6

2009-04-15 08:23:26

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 2/3] net: bridge: use device address list instead of dev_addr

This patch changes the handling of mac addresses of bridge port devices. Now
it uses previously introduced list of device addresses. It allows the bridge to
know more then one local mac address per port which is mandatory for the right
work in some cases.

Signed-off-by: Jiri Pirko <[email protected]>
---
net/bridge/br_fdb.c | 101 ++++++++++++++++++++++++++++++----------------
net/bridge/br_if.c | 2 +-
net/bridge/br_notify.c | 2 +-
net/bridge/br_private.h | 4 +-
4 files changed, 70 insertions(+), 39 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a48f5ef..1e63f76 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -77,10 +77,26 @@ static inline void fdb_delete(struct net_bridge_fdb_entry *f)
br_fdb_put(f);
}

-void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
+static bool another_port_has_addr(const struct net_bridge_port *p,
+ struct net_bridge_fdb_entry *f)
+{
+ struct net_bridge *br = p->br;
+ struct net_bridge_port *op;
+
+ list_for_each_entry(op, &br->port_list, list) {
+ if (op != p && is_etherdev_addr(op->dev, f->addr.addr)) {
+ f->dst = op;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+void br_fdb_changeaddr(struct net_bridge_port *p, struct net_device *dev)
{
struct net_bridge *br = p->br;
int i;
+ struct netdev_hw_addr *ha;

spin_lock_bh(&br->hash_lock);

@@ -92,26 +108,23 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)

f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
if (f->dst == p && f->is_local) {
- /* maybe another port has same hw addr? */
- struct net_bridge_port *op;
- list_for_each_entry(op, &br->port_list, list) {
- if (op != p &&
- !compare_ether_addr(op->dev->dev_addr,
- f->addr.addr)) {
- f->dst = op;
- goto insert;
- }
- }
-
- /* delete old one */
- fdb_delete(f);
- goto insert;
+ /*
+ * maybe another port has same hw addr?,
+ * if not then delete it
+ */
+ if (!another_port_has_addr(p, f))
+ fdb_delete(f);
}
}
}
- insert:
- /* insert new address, may fail if invalid address or dup. */
- fdb_insert(br, p, newaddr);
+
+ /* insert device addresses, may fail if invalid address. */
+
+ rcu_read_lock();
+ for_each_dev_addr(dev, ha) {
+ fdb_insert(br, p, ha->addr);
+ }
+ rcu_read_unlock();

spin_unlock_bh(&br->hash_lock);
}
@@ -189,20 +202,9 @@ void br_fdb_delete_by_port(struct net_bridge *br,
* then when one port is deleted, assign
* the local entry to other port
*/
- if (f->is_local) {
- struct net_bridge_port *op;
- list_for_each_entry(op, &br->port_list, list) {
- if (op != p &&
- !compare_ether_addr(op->dev->dev_addr,
- f->addr.addr)) {
- f->dst = op;
- goto skip_delete;
- }
- }
- }
-
- fdb_delete(f);
- skip_delete: ;
+ if (!f->is_local ||
+ !another_port_has_addr(p, f))
+ fdb_delete(f);
}
}
spin_unlock_bh(&br->hash_lock);
@@ -338,7 +340,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
}

static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr)
+ const unsigned char *addr)
{
struct hlist_head *head = &br->hash[br_mac_hash(addr)];
struct net_bridge_fdb_entry *fdb;
@@ -366,13 +368,42 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
return 0;
}

+static int fdb_insert_dev(struct net_bridge *br, struct net_bridge_port *source,
+ struct net_device *dev)
+{
+ struct netdev_hw_addr *ha, *ha2;
+ struct net_bridge_fdb_entry *fdb;
+ struct hlist_head *head;
+ int ret = 0;
+
+ rcu_read_lock();
+ for_each_dev_addr(dev, ha) {
+ ret = fdb_insert(br, source, ha->addr);
+ if (ret)
+ goto unroll;
+ }
+ goto unlock;
+unroll:
+ for_each_dev_addr(dev, ha2) {
+ if (ha2 == ha)
+ break;
+ head = &br->hash[br_mac_hash(ha2->addr)];
+ fdb = fdb_find(head, ha2->addr);
+ if (fdb && fdb->is_local)
+ fdb_delete(fdb);
+ }
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+
int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr)
+ struct net_device *dev)
{
int ret;

spin_lock_bh(&br->hash_lock);
- ret = fdb_insert(br, source, addr);
+ ret = fdb_insert_dev(br, source, dev);
spin_unlock_bh(&br->hash_lock);
return ret;
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8a96672..789cb30 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -392,7 +392,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
if (err)
goto err0;

- err = br_fdb_insert(br, p, dev->dev_addr);
+ err = br_fdb_insert(br, p, dev);
if (err)
goto err1;

diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 763a3ec..1423541 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -48,7 +48,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v

case NETDEV_CHANGEADDR:
spin_lock_bh(&br->lock);
- br_fdb_changeaddr(p, dev->dev_addr);
+ br_fdb_changeaddr(p, dev);
br_stp_recalculate_bridge_id(br);
spin_unlock_bh(&br->lock);
break;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b6c3b71..65ffe3d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -148,7 +148,7 @@ extern int br_fdb_init(void);
extern void br_fdb_fini(void);
extern void br_fdb_flush(struct net_bridge *br);
extern void br_fdb_changeaddr(struct net_bridge_port *p,
- const unsigned char *newaddr);
+ struct net_device *dev);
extern void br_fdb_cleanup(unsigned long arg);
extern void br_fdb_delete_by_port(struct net_bridge *br,
const struct net_bridge_port *p, int do_all);
@@ -161,7 +161,7 @@ extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
unsigned long count, unsigned long off);
extern int br_fdb_insert(struct net_bridge *br,
struct net_bridge_port *source,
- const unsigned char *addr);
+ struct net_device *dev);
extern void br_fdb_update(struct net_bridge *br,
struct net_bridge_port *source,
const unsigned char *addr);
--
1.6.0.6

2009-04-15 08:24:34

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 3/3] net: bonding: add slave device addresses in mode alb

When in mode alb, add all device addresses which belong to an enslaved slave
device to the bond device. This ensures that all mac addresses will be
treated as local and bonding in this mode will work fine in bridge.

Signed-off-by: Jiri Pirko <[email protected]>
---
drivers/net/bonding/bond_main.c | 23 ++++++++++++++++++++++-
1 files changed, 22 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 99610f3..4025dd0 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1385,6 +1385,11 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
bond->setup_by_slave = 1;
}

+static bool should_copy_dev_addrs(const struct bonding *bond)
+{
+ return (bond->params.mode == BOND_MODE_ALB);
+}
+
/* enslave device <slave> to bond device <master> */
int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
{
@@ -1510,6 +1515,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
*/
new_slave->original_flags = slave_dev->flags;

+ if (should_copy_dev_addrs(bond)) {
+ res = dev_addr_add_multiple(bond_dev, slave_dev);
+ if (res)
+ goto err_free;
+ }
+
/*
* Save slave's original ("permanent") mac address for modes
* that need it, and for restoring it upon release, and then
@@ -1527,7 +1538,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
res = dev_set_mac_address(slave_dev, &addr);
if (res) {
pr_debug("Error %d calling set_mac_address\n", res);
- goto err_free;
+ goto err_remove_dev_addrs;
}
}

@@ -1769,6 +1780,10 @@ err_restore_mac:
dev_set_mac_address(slave_dev, &addr);
}

+err_remove_dev_addrs:
+ if (should_copy_dev_addrs(bond))
+ dev_addr_del_multiple(bond_dev, slave_dev);
+
err_free:
kfree(new_slave);

@@ -1954,6 +1969,9 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
/* close slave before restoring its mac address */
dev_close(slave_dev);

+ if (should_copy_dev_addrs(bond))
+ dev_addr_del_multiple(bond_dev, slave_dev);
+
if (bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
/* restore original ("permanent") mac address */
memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
@@ -2090,6 +2108,9 @@ static int bond_release_all(struct net_device *bond_dev)
/* close slave before restoring its mac address */
dev_close(slave_dev);

+ if (should_copy_dev_addrs(bond))
+ dev_addr_del_multiple(bond_dev, slave_dev);
+
if (!bond->params.fail_over_mac) {
/* restore original ("permanent") mac address*/
memcpy(addr.sa_data, slave->perm_hwaddr, ETH_ALEN);
--
1.6.0.6

2009-04-15 08:25:39

by Li Zefan

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> + int addr_len, int ignore_index)
> +{
> + struct netdev_hw_addr *ha;
> + int i = 0;
> +
> + if (addr_len > MAX_ADDR_LEN)
> + return -EINVAL;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(ha, list, list) {
> + if (i++ != ignore_index &&
> + !memcmp(ha->addr, addr, addr_len)) {
> + ha->refcount++;
> + return 0;

missing rcu_read_unlock() ?

> + }
> + }
> + rcu_read_unlock();

2009-04-15 08:32:16

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

Wed, Apr 15, 2009 at 10:26:04AM CEST, [email protected] wrote:
>> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>> + int addr_len, int ignore_index)
>> +{
>> + struct netdev_hw_addr *ha;
>> + int i = 0;
>> +
>> + if (addr_len > MAX_ADDR_LEN)
>> + return -EINVAL;
>> +
>> + rcu_read_lock();
>> + list_for_each_entry_rcu(ha, list, list) {
>> + if (i++ != ignore_index &&
>> + !memcmp(ha->addr, addr, addr_len)) {
>> + ha->refcount++;
>> + return 0;
>
>missing rcu_read_unlock() ?
>
Sure! Thanks...
>> + }
>> + }
>> + rcu_read_unlock();

2009-04-15 08:34:44

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with device address list. The list is a replacement
for the original dev_addr field and because in some situations there is need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
sees no difference.

Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.

Signed-off-by: Jiri Pirko <[email protected]>
---
include/linux/etherdevice.h | 24 ++++
include/linux/netdevice.h | 31 +++++-
net/core/dev.c | 263 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..348a75e 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
(a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
}

+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address if one of the device addresses.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+ const u8 *addr)
+{
+ struct netdev_hw_addr *ha;
+ int res = 1;
+
+ rcu_read_lock();
+ for_each_dev_addr(dev, ha) {
+ res = compare_ether_addr(addr, ha->addr);
+ if (!res)
+ break;
+ }
+ rcu_read_unlock();
+ return !res;
+}
+
#endif /* _LINUX_ETHERDEVICE_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..77abfdf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,12 @@ struct dev_addr_list
#define dmi_users da_users
#define dmi_gusers da_gusers

+struct netdev_hw_addr {
+ struct list_head list;
+ unsigned char addr[MAX_ADDR_LEN];
+ int refcount;
+};
+
struct hh_cache
{
struct hh_cache *hh_next; /* Next entry */
@@ -776,8 +782,11 @@ struct net_device
*/
unsigned long last_rx; /* Time of last Rx */
/* Interface address info used in eth_type_trans() */
- unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
- because most packets are unicast) */
+ unsigned char *dev_addr; /* hw address, (before bcast
+ because most packets are
+ unicast) */
+
+ struct list_head dev_addr_list; /* list of device hw addresses */

unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */

@@ -1778,6 +1787,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
spin_unlock_bh(&dev->addr_list_lock);
}

+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+ list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
/* These functions live elsewhere (drivers/net/net_init.c, but related) */

extern void ether_setup(struct net_device *dev);
@@ -1790,6 +1806,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
alloc_netdev_mq(sizeof_priv, name, setup, 1)
extern int register_netdev(struct net_device *dev);
extern void unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int dev_addr_add(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_del(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+extern int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+
/* Functions used for secondary unicast and multicast support */
extern void dev_set_rx_mode(struct net_device *dev);
extern void __dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..04cddbb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,262 @@ void dev_set_rx_mode(struct net_device *dev)
netif_addr_unlock_bh(dev);
}

+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ ha->refcount++;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->refcount = 1;
+ list_add_tail_rcu(&ha->list, list);
+ return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ if (--ha->refcount)
+ return 0;
+ list_del_rcu(&ha->list);
+ synchronize_rcu();
+ kfree(ha);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *ha2;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ha, from_list, list) {
+ err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+ if (err)
+ goto unroll;
+ }
+ goto unlock;
+unroll:
+ list_for_each_entry_rcu(ha2, from_list, list) {
+ if (ha2 == ha)
+ break;
+ __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+ }
+unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ha, from_list, list) {
+ __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+ }
+ rcu_read_unlock();
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, list, list) {
+ list_del_rcu(&ha->list);
+ synchronize_rcu();
+ kfree(ha);
+ }
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+ __hw_addr_flush(&dev->dev_addr_list);
+ dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ INIT_LIST_HEAD(&dev->dev_addr_list);
+ memset(addr, 0, sizeof(*addr));
+ err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ rcu_read_lock();
+ ha = list_first_entry_rcu(&dev->dev_addr_list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ rcu_read_unlock();
+ }
+ return err;
+}
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ * dev_addr_add_multiple - Add device addresses from another device
+ * @to_dev: device to which addresses will be added
+ * @from_dev: device from which addresses will be added
+ *
+ * Add device addresses of the one device to another.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ * dev_addr_del_multiple - Delete device addresses by another device
+ * @to_dev: device where the addresses will be deleted
+ * @from_dev: device by which addresses the addresses will be deleted
+ *
+ * Deletes addresses in to device by the list of addresses in from device.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
int __dev_addr_delete(struct dev_addr_list **list, int *count,
void *addr, int alen, int glbl)
{
@@ -4257,6 +4513,9 @@ static void rollback_registered(struct net_device *dev)
*/
dev_addr_discard(dev);

+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);

@@ -4779,6 +5038,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,

dev->gso_max_size = GSO_MAX_SIZE;

+ dev_addr_init(dev);
netdev_init_queues(dev);

INIT_LIST_HEAD(&dev->napi_list);
@@ -4965,6 +5225,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
*/
dev_addr_discard(dev);

+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
netdev_unregister_kobject(dev);

/* Actually switch the network namespace */
--
1.6.0.6

2009-04-15 09:21:53

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

From: Jiri Pirko <[email protected]>
Date: Wed, 15 Apr 2009 10:32:24 +0200

> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
>
> Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.
>
> Signed-off-by: Jiri Pirko <[email protected]>

Jiri, please add some distinguishing text to your subject lines when
you post fixed up version of patches. Like "v2" or something like
that, and make a note under the commit message of the changes you've
made from the previous version.

Otherwise I think it's a dup (because I get a thousand copies anyways)
and will just delete it both in my inbox and on patchwork.

Thanks.

2009-04-15 09:31:06

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

Jiri Pirko a ?crit :
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
>

You see no difference ? Please look more closely...

I see one additional dereference in hot path, to small objects possibly
with false sharing effects.

So I would advise not changing dev_addr[] to a pointer.
And instead copy first netdev_hw_addr into it.

Also, doing a kzalloc(sizeof(struct netdev_hw_addr)) for allocating these structs
might give a block of memory < L1_CACHE_SIZE so kernel is free to give other
part of this cache line to some other layer that could be a hot spot, so
false sharing could happen.

kzalloc(max(sizeof(*ha), L1_CACHE_SIZE)) is thus higly recommended here.

> Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.
>
> Signed-off-by: Jiri Pirko <[email protected]>
> ---
> include/linux/etherdevice.h | 24 ++++
> include/linux/netdevice.h | 31 +++++-
> net/core/dev.c | 263 +++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 316 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index a1f17ab..348a75e 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
> (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
> }
>
> +/**
> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
> + * @dev: Pointer to a device structure
> + * @addr: Pointer to a six-byte array containing the Ethernet address
> + *
> + * Compare passed address with all addresses of the device. Return true if the
> + * address if one of the device addresses.
> + */
> +static inline bool is_etherdev_addr(const struct net_device *dev,
> + const u8 *addr)
> +{
> + struct netdev_hw_addr *ha;
> + int res = 1;
> +
> + rcu_read_lock();
> + for_each_dev_addr(dev, ha) {
> + res = compare_ether_addr(addr, ha->addr);

compare_ether_addr_64bits() please ?

> + if (!res)
> + break;
> + }
> + rcu_read_unlock();
> + return !res;
> +}
> +
> #endif /* _LINUX_ETHERDEVICE_H */
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..77abfdf 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,12 @@ struct dev_addr_list
> #define dmi_users da_users
> #define dmi_gusers da_gusers
>
> +struct netdev_hw_addr {
> + struct list_head list;
> + unsigned char addr[MAX_ADDR_LEN];
> + int refcount;
> +};
> +
> struct hh_cache
> {
> struct hh_cache *hh_next; /* Next entry */
> @@ -776,8 +782,11 @@ struct net_device
> */
> unsigned long last_rx; /* Time of last Rx */
> /* Interface address info used in eth_type_trans() */
> - unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
> - because most packets are unicast) */
> + unsigned char *dev_addr; /* hw address, (before bcast
> + because most packets are
> + unicast) */
> +
> + struct list_head dev_addr_list; /* list of device hw addresses */
>
> unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
>
> @@ -1778,6 +1787,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
> spin_unlock_bh(&dev->addr_list_lock);
> }
>
> +/*
> + * dev_addr_list walker. Should be used only for read access. Call with
> + * rcu_read_lock held.
> + */
> +#define for_each_dev_addr(dev, ha) \
> + list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
> +
> /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>
> extern void ether_setup(struct net_device *dev);
> @@ -1790,6 +1806,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> alloc_netdev_mq(sizeof_priv, name, setup, 1)
> extern int register_netdev(struct net_device *dev);
> extern void unregister_netdev(struct net_device *dev);
> +
> +/* Functions used for device addresses handling */
> +extern int dev_addr_add(struct net_device *dev,
> + unsigned char *addr);
> +extern int dev_addr_del(struct net_device *dev,
> + unsigned char *addr);
> +extern int dev_addr_add_multiple(struct net_device *to_dev,
> + struct net_device *from_dev);
> +extern int dev_addr_del_multiple(struct net_device *to_dev,
> + struct net_device *from_dev);
> +
> /* Functions used for secondary unicast and multicast support */
> extern void dev_set_rx_mode(struct net_device *dev);
> extern void __dev_set_rx_mode(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 91d792d..04cddbb 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3437,6 +3437,262 @@ void dev_set_rx_mode(struct net_device *dev)
> netif_addr_unlock_bh(dev);
> }
>
> +/* hw addresses list handling functions */
> +
> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> + int addr_len, int ignore_index)
> +{
> + struct netdev_hw_addr *ha;
> + int i = 0;
> +
> + if (addr_len > MAX_ADDR_LEN)
> + return -EINVAL;
> +
> + rcu_read_lock();

This locking is highly suspect.

> + list_for_each_entry_rcu(ha, list, list) {
> + if (i++ != ignore_index &&
> + !memcmp(ha->addr, addr, addr_len)) {
> + ha->refcount++;
> + rcu_read_unlock();
> + return 0;
> + }
> + }
> + rcu_read_unlock();

Since you obviously need a write lock here to be sure following
can be done by one cpu only.

You have same problem all over this patch.

> +
> + ha = kzalloc(sizeof(*ha), GFP_ATOMIC);

kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus higly recommended here.

Also, why GFP_ATOMIC is needed here ?

> + if (!ha)
> + return -ENOMEM;
> + memcpy(ha->addr, addr, addr_len);
> + ha->refcount = 1;
> + list_add_tail_rcu(&ha->list, list);
> + return 0;
> +}
> +
> +static int __hw_addr_add(struct list_head *list, unsigned char *addr,
> + int addr_len)
> +{
> + return __hw_addr_add_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
> + int addr_len, int ignore_index)
> +{
> + struct netdev_hw_addr *ha;
> + int i = 0;
> +
> + list_for_each_entry(ha, list, list) {
> + if (i++ != ignore_index &&
> + !memcmp(ha->addr, addr, addr_len)) {
> + if (--ha->refcount)
> + return 0;
> + list_del_rcu(&ha->list);
> + synchronize_rcu();

Oh well... I'm pretty sure this synchronize_rcu() call can be avoided,
dont you think ? Check kfree_rcu() or equivalent, as it seems not yet
included in current kernels...

> + kfree(ha);
> + return 0;
> + }
> + }
> + return -ENOENT;
> +}
> +
> +static int __hw_addr_del(struct list_head *list, unsigned char *addr,
> + int addr_len)
> +{
> + return __hw_addr_del_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len, int ignore_index)
> +{
> + int err = 0;
> + struct netdev_hw_addr *ha, *ha2;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(ha, from_list, list) {
> + err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
> + if (err)
> + goto unroll;
> + }
> + goto unlock;
> +unroll:
> + list_for_each_entry_rcu(ha2, from_list, list) {
> + if (ha2 == ha)
> + break;
> + __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
> + }
> +unlock:
> + rcu_read_unlock();
> + return err;
> +}
> +
> +static int __hw_addr_add_multiple(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len)
> +{
> + return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_del_multiple_ii(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len, int ignore_index)
> +{
> + struct netdev_hw_addr *ha;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(ha, from_list, list) {
> + __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
> + }
> + rcu_read_unlock();
> +}
> +
> +static void __hw_addr_del_multiple(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len)
> +{
> + __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_flush(struct list_head *list)
> +{
> + struct netdev_hw_addr *ha, *tmp;
> +
> + list_for_each_entry_safe(ha, tmp, list, list) {
> + list_del_rcu(&ha->list);
> + synchronize_rcu();

Oh no... :(

> + kfree(ha);
> + }
> +}
> +
> +/* Device addresses handling functions */
> +
> +static void dev_addr_flush(struct net_device *dev)
> +{
> + __hw_addr_flush(&dev->dev_addr_list);
> + dev->dev_addr = NULL;
> +}
> +
> +static int dev_addr_init(struct net_device *dev)
> +{
> + unsigned char addr[MAX_ADDR_LEN];
> + struct netdev_hw_addr *ha;
> + int err;
> +
> + INIT_LIST_HEAD(&dev->dev_addr_list);
> + memset(addr, 0, sizeof(*addr));
> + err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
> + if (!err) {
> + /*
> + * Get the first (previously created) address from the list
> + * and set dev_addr pointer to this location.
> + */
> + rcu_read_lock();

locking is not correct or unnecessary

> + ha = list_first_entry_rcu(&dev->dev_addr_list,
> + struct netdev_hw_addr, list);
> + dev->dev_addr = ha->addr;
> + rcu_read_unlock();
> + }
> + return err;
> +}
> +
> +/**
> + * dev_addr_add - Add a device address
> + * @dev: device
> + * @addr: address to add
> + *
> + * Add a device address to the device or increase the reference count if
> + * it already exists.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add(struct net_device *dev, unsigned char *addr)
> +{
> + int err;
> +
> + ASSERT_RTNL();
> +
> + err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> + if (!err)
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> + return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add);
> +
> +/**
> + * dev_addr_del - Release a device address.
> + * @dev: device
> + * @addr: address to delete
> + *
> + * Release reference to a device address and remove it from the device
> + * if the reference count drops to zero.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del(struct net_device *dev, unsigned char *addr)
> +{
> + int err;
> +
> + ASSERT_RTNL();
> +
> + err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> + if (!err)
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> + return err;
> +}
> +EXPORT_SYMBOL(dev_addr_del);
> +
> +/**
> + * dev_addr_add_multiple - Add device addresses from another device
> + * @to_dev: device to which addresses will be added
> + * @from_dev: device from which addresses will be added
> + *
> + * Add device addresses of the one device to another.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add_multiple(struct net_device *to_dev,
> + struct net_device *from_dev)
> +{
> + int err;
> +
> + ASSERT_RTNL();
> +
> + if (from_dev->addr_len != to_dev->addr_len)
> + return -EINVAL;
> + err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> + &from_dev->dev_addr_list,
> + to_dev->addr_len, 0);
> + if (!err)
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> + return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add_multiple);
> +
> +/**
> + * dev_addr_del_multiple - Delete device addresses by another device
> + * @to_dev: device where the addresses will be deleted
> + * @from_dev: device by which addresses the addresses will be deleted
> + *
> + * Deletes addresses in to device by the list of addresses in from device.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del_multiple(struct net_device *to_dev,
> + struct net_device *from_dev)
> +{
> + ASSERT_RTNL();
> +
> + if (from_dev->addr_len != to_dev->addr_len)
> + return -EINVAL;
> + __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> + &from_dev->dev_addr_list,
> + to_dev->addr_len, 0);
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> + return 0;
> +}
> +EXPORT_SYMBOL(dev_addr_del_multiple);
> +
> +/* unicast and multicast addresses handling functions */
> +
> int __dev_addr_delete(struct dev_addr_list **list, int *count,
> void *addr, int alen, int glbl)
> {
> @@ -4257,6 +4513,9 @@ static void rollback_registered(struct net_device *dev)
> */
> dev_addr_discard(dev);
>
> + /* Flush device addresses */
> + dev_addr_flush(dev);
> +
> if (dev->netdev_ops->ndo_uninit)
> dev->netdev_ops->ndo_uninit(dev);
>
> @@ -4779,6 +5038,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>
> dev->gso_max_size = GSO_MAX_SIZE;
>
> + dev_addr_init(dev);
> netdev_init_queues(dev);
>
> INIT_LIST_HEAD(&dev->napi_list);
> @@ -4965,6 +5225,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
> */
> dev_addr_discard(dev);
>
> + /* Flush device addresses */
> + dev_addr_flush(dev);
> +
> netdev_unregister_kobject(dev);
>
> /* Actually switch the network namespace */

2009-04-15 09:31:44

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

From: Eric Dumazet <[email protected]>
Date: Wed, 15 Apr 2009 11:27:50 +0200

> Since you obviously need a write lock here to be sure following
> can be done by one cpu only.
>
> You have same problem all over this patch.

RTNL semaphore is held across all modification operations.

2009-04-15 10:15:29

by Patrick McHardy

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

David Miller wrote:
> From: Eric Dumazet <[email protected]>
> Date: Wed, 15 Apr 2009 11:27:50 +0200
>
>> Since you obviously need a write lock here to be sure following
>> can be done by one cpu only.
>>
>> You have same problem all over this patch.
>
> RTNL semaphore is held across all modification operations.

If this will also be used for multicast lists, changes can happen
(IPv6) without the RTNL.

2009-04-15 10:16:11

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

From: Patrick McHardy <[email protected]>
Date: Wed, 15 Apr 2009 12:13:57 +0200

> David Miller wrote:
>> From: Eric Dumazet <[email protected]>
>> Date: Wed, 15 Apr 2009 11:27:50 +0200
>>
>>> Since you obviously need a write lock here to be sure following
>>> can be done by one cpu only.
>>>
>>> You have same problem all over this patch.
>> RTNL semaphore is held across all modification operations.
>
> If this will also be used for multicast lists, changes can happen
> (IPv6) without the RTNL.

Indeed, that is true :-/

2009-04-15 10:41:30

by Patrick McHardy

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

David Miller wrote:
> From: Patrick McHardy <[email protected]>
> Date: Wed, 15 Apr 2009 12:13:57 +0200
>
>> David Miller wrote:
>>> From: Eric Dumazet <[email protected]>
>>> Date: Wed, 15 Apr 2009 11:27:50 +0200
>>>
>>>> Since you obviously need a write lock here to be sure following
>>>> can be done by one cpu only.
>>>>
>>>> You have same problem all over this patch.
>>> RTNL semaphore is held across all modification operations.
>> If this will also be used for multicast lists, changes can happen
>> (IPv6) without the RTNL.
>
> Indeed, that is true :-/

Herbert (I think) suggested to make address list updates in softirq
context a two-step process, where addresses would first be added to
a temporary list and the final change would be done in process context
while holding the RTNL.

Given the complicated mess we currently have, this would be a very
worthwhile change IMO.

2009-04-15 10:45:38

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

From: Patrick McHardy <[email protected]>
Date: Wed, 15 Apr 2009 12:41:01 +0200

> Herbert (I think) suggested to make address list updates in softirq
> context a two-step process, where addresses would first be added to
> a temporary list and the final change would be done in process context
> while holding the RTNL.
>
> Given the complicated mess we currently have, this would be a very
> worthwhile change IMO.

This would break the IPV6 TAHI tests if you think we could use
such an idea for that.

When IPV6 packets arrive that influence multicast and unicast
address lists, the effect must be essentially immediate. Such
that a subsequent packet will cause the kernel the behave
with the necessary side effects, no matter how quickly that
next packet arrives.

2009-04-15 10:48:14

by Patrick McHardy

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

David Miller wrote:
> From: Patrick McHardy <[email protected]>
> Date: Wed, 15 Apr 2009 12:41:01 +0200
>
>> Herbert (I think) suggested to make address list updates in softirq
>> context a two-step process, where addresses would first be added to
>> a temporary list and the final change would be done in process context
>> while holding the RTNL.
>>
>> Given the complicated mess we currently have, this would be a very
>> worthwhile change IMO.
>
> This would break the IPV6 TAHI tests if you think we could use
> such an idea for that.
>
> When IPV6 packets arrive that influence multicast and unicast
> address lists, the effect must be essentially immediate. Such
> that a subsequent packet will cause the kernel the behave
> with the necessary side effects, no matter how quickly that
> next packet arrives.

I see, thanks for the explanation.

2009-04-15 11:20:31

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

Wed, Apr 15, 2009 at 11:27:50AM CEST, [email protected] wrote:
>Jiri Pirko a ?crit :
>> This patch introduces a new list in struct net_device and brings a set of
>> functions to handle the work with device address list. The list is a replacement
>> for the original dev_addr field and because in some situations there is need to
>> carry several device addresses with the net device. To be backward compatible,
>> dev_addr is made to point to the first member of the list so original drivers
>> sees no difference.
>>
>
>You see no difference ? Please look more closely...
>
>I see one additional dereference in hot path, to small objects possibly
>with false sharing effects.
>
>So I would advise not changing dev_addr[] to a pointer.
>And instead copy first netdev_hw_addr into it.

Hmm :( That is what I was trying to avoid. If the first netdev_hw_addr in the
list is a copy of dev_addr, then there must be synchronizing of those two. This
would be a pain.. Plus I thought that eventually dev_addr would not be
accessible directly but only by set of macros/inlines to accesse the list, and
then dev_addr would be removed from struct net_device.
>
>Also, doing a kzalloc(sizeof(struct netdev_hw_addr)) for allocating these structs
>might give a block of memory < L1_CACHE_SIZE so kernel is free to give other
>part of this cache line to some other layer that could be a hot spot, so
>false sharing could happen.
>
>kzalloc(max(sizeof(*ha), L1_CACHE_SIZE)) is thus higly recommended here.
You mean PAGE_CACHE_SIZE? I think that would be little wasting... But I see your
point...
>
>> Note: patch adding list_first_entry_rcu (currently in Ingo's tip tree) needed.
>>
>> Signed-off-by: Jiri Pirko <[email protected]>
>> ---
>> include/linux/etherdevice.h | 24 ++++
>> include/linux/netdevice.h | 31 +++++-
>> net/core/dev.c | 263 +++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 316 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
>> index a1f17ab..348a75e 100644
>> --- a/include/linux/etherdevice.h
>> +++ b/include/linux/etherdevice.h
>> @@ -205,4 +205,28 @@ static inline int compare_ether_header(const void *a, const void *b)
>> (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
>> }
>>
>> +/**
>> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
>> + * @dev: Pointer to a device structure
>> + * @addr: Pointer to a six-byte array containing the Ethernet address
>> + *
>> + * Compare passed address with all addresses of the device. Return true if the
>> + * address if one of the device addresses.
>> + */
>> +static inline bool is_etherdev_addr(const struct net_device *dev,
>> + const u8 *addr)
>> +{
>> + struct netdev_hw_addr *ha;
>> + int res = 1;
>> +
>> + rcu_read_lock();
>> + for_each_dev_addr(dev, ha) {
>> + res = compare_ether_addr(addr, ha->addr);
>
>compare_ether_addr_64bits() please ?
>
I used the original as the bridge code used it. Ok, noted.

<snip>

>> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>> + int addr_len, int ignore_index)
>> +{
>> + struct netdev_hw_addr *ha;
>> + int i = 0;
>> +
>> + if (addr_len > MAX_ADDR_LEN)
>> + return -EINVAL;
>> +
>> + rcu_read_lock();
>
>This locking is highly suspect.
>
>> + list_for_each_entry_rcu(ha, list, list) {
>> + if (i++ != ignore_index &&
>> + !memcmp(ha->addr, addr, addr_len)) {
>> + ha->refcount++;
>> + rcu_read_unlock();
>> + return 0;
>> + }
>> + }
>> + rcu_read_unlock();
>
>Since you obviously need a write lock here to be sure following
>can be done by one cpu only.
>
>You have same problem all over this patch.

Yes, as Dave wrote, this is guarded by RTNL mutex.
>
>> +
>> + ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
>
>kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus higly recommended here.
>
>Also, why GFP_ATOMIC is needed here ?

Yes, it is not needed here. I've copied it here from the original unicast and
multicast add funtion to stay close but as I can see, there is no need for it
there either.
Noted.
>

<snip>

>> + list_for_each_entry(ha, list, list) {
>> + if (i++ != ignore_index &&
>> + !memcmp(ha->addr, addr, addr_len)) {
>> + if (--ha->refcount)
>> + return 0;
>> + list_del_rcu(&ha->list);
>> + synchronize_rcu();
>
>Oh well... I'm pretty sure this synchronize_rcu() call can be avoided,
>dont you think ? Check kfree_rcu() or equivalent, as it seems not yet
>included in current kernels...
>
Well once kfree_rcu() will be in the tree I will be happy to replace this.

>> + kfree(ha);
>> + return 0;
>> + }
>> + }
>> + return -ENOENT;

<snip>

>> + err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>> + if (!err) {
>> + /*
>> + * Get the first (previously created) address from the list
>> + * and set dev_addr pointer to this location.
>> + */
>> + rcu_read_lock();
>
>locking is not correct or unnecessary

Agree that here locking is not necessary, but I wanted to stay consistent to the
rest of the code. Do you think I should remove locking here entirely?

>
>> + ha = list_first_entry_rcu(&dev->dev_addr_list,
>> + struct netdev_hw_addr, list);
>> + dev->dev_addr = ha->addr;
>> + rcu_read_unlock();
>> + }
>
>

2009-04-15 11:22:57

by Patrick McHardy

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

Jiri Pirko wrote:

>> Since you obviously need a write lock here to be sure following
>> can be done by one cpu only.
>>
>> You have same problem all over this patch.
>
> Yes, as Dave wrote, this is guarded by RTNL mutex.

This was incorrect. IPv6 adds multicast addresses in softirq context.

>>> +
>>> + ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
>> kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus higly recommended here.
>>
>> Also, why GFP_ATOMIC is needed here ?
>
> Yes, it is not needed here. I've copied it here from the original unicast and
> multicast add funtion to stay close but as I can see, there is no need for it
> there either.
> Noted.

Also needed for IPv6 in softirq context.

2009-04-15 11:31:09

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

Wed, Apr 15, 2009 at 01:22:32PM CEST, [email protected] wrote:
> Jiri Pirko wrote:
>
>>> Since you obviously need a write lock here to be sure following
>>> can be done by one cpu only.
>>>
>>> You have same problem all over this patch.
>>
>> Yes, as Dave wrote, this is guarded by RTNL mutex.
>
> This was incorrect. IPv6 adds multicast addresses in softirq context.

Yes, I see that.
>
>>>> +
>>>> + ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
>>> kzalloc(max(sizeof(*ha), L1_CACHE_SIZE), GFP_...) is thus higly recommended here.
>>>
>>> Also, why GFP_ATOMIC is needed here ?
>>
>> Yes, it is not needed here. I've copied it here from the original unicast and
>> multicast add funtion to stay close but as I can see, there is no need for it
>> there either.
>> Noted.
>
> Also needed for IPv6 in softirq context.
>

Noted...

2009-04-15 12:32:17

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

Jiri Pirko a ?crit :
> Wed, Apr 15, 2009 at 11:27:50AM CEST, [email protected] wrote:

>> kzalloc(max(sizeof(*ha), L1_CACHE_SIZE)) is thus higly recommended here.
> You mean PAGE_CACHE_SIZE? I think that would be little wasting... But I see your
> point...

No, I meant L1_CACHE_BYTES (usually 64 bytes on x86), I always confuse BYTES and SIZE on this one...


>>> + list_for_each_entry(ha, list, list) {
>>> + if (i++ != ignore_index &&
>>> + !memcmp(ha->addr, addr, addr_len)) {
>>> + if (--ha->refcount)
>>> + return 0;
>>> + list_del_rcu(&ha->list);
>>> + synchronize_rcu();
>> Oh well... I'm pretty sure this synchronize_rcu() call can be avoided,
>> dont you think ? Check kfree_rcu() or equivalent, as it seems not yet
>> included in current kernels...
>>
> Well once kfree_rcu() will be in the tree I will be happy to replace this.

If kfree_rcu() not yet available, please use a regular call_rcu() construct
(thus adding a struct rcu_head rcu; in struct netdev_hw_addr)

If you delete say 10 addresses on a device, while RTNL (or other lock) locked,
that means a lot of calls to synchronize_rcu() and a long lock hold time.

>
>>> + kfree(ha);
>>> + return 0;
>>> + }
>>> + }
>>> + return -ENOENT;
>
> <snip>
>
>>> + err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>>> + if (!err) {
>>> + /*
>>> + * Get the first (previously created) address from the list
>>> + * and set dev_addr pointer to this location.
>>> + */
>>> + rcu_read_lock();
>> locking is not correct or unnecessary
>
> Agree that here locking is not necessary, but I wanted to stay consistent to the
> rest of the code. Do you think I should remove locking here entirely?

Yes, it is very confusing for reviewers because we feel patch submiter
is not comfortable with locking rules.

Check for example dev_add_pack() in net/core/dev.c : It uses list_add_rcu()
but as it also uses a regular spinlock, there is no point using rcu_read_lock().

void dev_add_pack(struct packet_type *pt)
{
int hash;

spin_lock_bh(&ptype_lock);
if (pt->type == htons(ETH_P_ALL))
list_add_rcu(&pt->list, &ptype_all);
else {
hash = ntohs(pt->type) & PTYPE_HASH_MASK;
list_add_rcu(&pt->list, &ptype_base[hash]);
}
spin_unlock_bh(&ptype_lock);
}



Please note list_add_rcu() (and/or rcu_assign_pointer()) are still needed to protect
readers that dont use the spinlock at all.

If you use fact that RTNL is locked when calling your code, you could add
ASSERT_RTNL();
at strategic points so that this assertion can be checked at runtime.

(but Patrick & David wrote that you should not assume RTNL, so you probably need another lock...)

Thank you

2009-04-15 14:45:06

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list

Wed, Apr 15, 2009 at 12:13:57PM CEST, [email protected] wrote:
> David Miller wrote:
>> From: Eric Dumazet <[email protected]>
>> Date: Wed, 15 Apr 2009 11:27:50 +0200
>>
>>> Since you obviously need a write lock here to be sure following
>>> can be done by one cpu only.
>>>
>>> You have same problem all over this patch.
>>
>> RTNL semaphore is held across all modification operations.
>
> If this will also be used for multicast lists, changes can happen
> (IPv6) without the RTNL.

Ok, but for dev_addr_X() functions the RTNL mutex is sufficient so I see no
point of adding another lock here. When the multicast handling functions will be
implemented to use netdev_hw_addr and it's layer, then we need to use update
lock in dev_multicast_X.

2009-04-15 18:04:20

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v2)

changes against last patch version:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of rcu_synchronize
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with device address list. The list is a replacement
for the original dev_addr field and because in some situations there is need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
sees no difference.

Signed-off-by: Jiri Pirko <[email protected]>
---
include/linux/etherdevice.h | 27 +++++
include/linux/netdevice.h | 32 +++++-
net/core/dev.c | 271 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 328 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
return compare_ether_addr(addr1, addr2);
#endif
}
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address if one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+ const u8 addr[6 + 2])
+{
+ struct netdev_hw_addr *ha;
+ int res = 1;
+
+ rcu_read_lock();
+ for_each_dev_addr(dev, ha) {
+ res = compare_ether_addr_64bits(addr, ha->addr);
+ if (!res)
+ break;
+ }
+ rcu_read_unlock();
+ return !res;
+}
#endif /* __KERNEL__ */

/**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..89ad6d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,13 @@ struct dev_addr_list
#define dmi_users da_users
#define dmi_gusers da_gusers

+struct netdev_hw_addr {
+ struct list_head list;
+ unsigned char addr[MAX_ADDR_LEN];
+ int refcount;
+ struct rcu_head rcu_head;
+};
+
struct hh_cache
{
struct hh_cache *hh_next; /* Next entry */
@@ -776,8 +783,11 @@ struct net_device
*/
unsigned long last_rx; /* Time of last Rx */
/* Interface address info used in eth_type_trans() */
- unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
- because most packets are unicast) */
+ unsigned char *dev_addr; /* hw address, (before bcast
+ because most packets are
+ unicast) */
+
+ struct list_head dev_addr_list; /* list of device hw addresses */

unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */

@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
spin_unlock_bh(&dev->addr_list_lock);
}

+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+ list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
/* These functions live elsewhere (drivers/net/net_init.c, but related) */

extern void ether_setup(struct net_device *dev);
@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
alloc_netdev_mq(sizeof_priv, name, setup, 1)
extern int register_netdev(struct net_device *dev);
extern void unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int dev_addr_add(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_del(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+extern int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+
/* Functions used for secondary unicast and multicast support */
extern void dev_set_rx_mode(struct net_device *dev);
extern void __dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d..961be4f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3437,6 +3437,270 @@ void dev_set_rx_mode(struct net_device *dev)
netif_addr_unlock_bh(dev);
}

+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ ha->refcount++;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->refcount = 1;
+ list_add_tail_rcu(&ha->list, list);
+ return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+ struct netdev_hw_addr *ha;
+
+ ha = container_of(head, struct netdev_hw_addr, rcu_head);
+ kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ if (--ha->refcount)
+ return 0;
+ list_del_rcu(&ha->list);
+ call_rcu(&ha->rcu_head, ha_rcu_free);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *ha2;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ha, from_list, list) {
+ err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+ if (err)
+ goto unroll;
+ }
+ goto unlock;
+unroll:
+ list_for_each_entry_rcu(ha2, from_list, list) {
+ if (ha2 == ha)
+ break;
+ __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+ }
+unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ha, from_list, list) {
+ __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+ }
+ rcu_read_unlock();
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, list, list) {
+ list_del_rcu(&ha->list);
+ call_rcu(&ha->rcu_head, ha_rcu_free);
+ }
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ __hw_addr_flush(&dev->dev_addr_list);
+ dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ ASSERT_RTNL();
+
+ INIT_LIST_HEAD(&dev->dev_addr_list);
+ memset(addr, 0, sizeof(*addr));
+ err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ ha = list_first_entry(&dev->dev_addr_list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ }
+ return err;
+}
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ * dev_addr_add_multiple - Add device addresses from another device
+ * @to_dev: device to which addresses will be added
+ * @from_dev: device from which addresses will be added
+ *
+ * Add device addresses of the one device to another.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ * dev_addr_del_multiple - Delete device addresses by another device
+ * @to_dev: device where the addresses will be deleted
+ * @from_dev: device by which addresses the addresses will be deleted
+ *
+ * Deletes addresses in to device by the list of addresses in from device.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
int __dev_addr_delete(struct dev_addr_list **list, int *count,
void *addr, int alen, int glbl)
{
@@ -4257,6 +4521,9 @@ static void rollback_registered(struct net_device *dev)
*/
dev_addr_discard(dev);

+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);

@@ -4779,6 +5046,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,

dev->gso_max_size = GSO_MAX_SIZE;

+ dev_addr_init(dev);
netdev_init_queues(dev);

INIT_LIST_HEAD(&dev->napi_list);
@@ -4965,6 +5233,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
*/
dev_addr_discard(dev);

+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
netdev_unregister_kobject(dev);

/* Actually switch the network namespace */
--
1.6.0.6

2009-04-15 18:58:30

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v2)

Jiri Pirko a ?crit :
> changes against last patch version:
> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
> -removed unnecessary rcu_read locking in dev_addr_init
> -use compare_ether_addr_64bits instead of compare_ether_addr
> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
> -use call_rcu instead of rcu_synchronize
> -moved is_etherdev_addr into __KERNEL__ ifdef
>
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
>
> Signed-off-by: Jiri Pirko <[email protected]>
> ---
> include/linux/etherdevice.h | 27 +++++
> include/linux/netdevice.h | 32 +++++-
> net/core/dev.c | 271 +++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 328 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index a1f17ab..3d7a668 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
> return compare_ether_addr(addr1, addr2);
> #endif
> }
> +
> +/**
> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
> + * @dev: Pointer to a device structure
> + * @addr: Pointer to a six-byte array containing the Ethernet address
> + *
> + * Compare passed address with all addresses of the device. Return true if the
> + * address if one of the device addresses.
> + *
> + * Note that this function calls compare_ether_addr_64bits() so take care of
> + * the right padding.
> + */
> +static inline bool is_etherdev_addr(const struct net_device *dev,
> + const u8 addr[6 + 2])
> +{
> + struct netdev_hw_addr *ha;
> + int res = 1;
> +
> + rcu_read_lock();
> + for_each_dev_addr(dev, ha) {
> + res = compare_ether_addr_64bits(addr, ha->addr);
> + if (!res)
> + break;
> + }
> + rcu_read_unlock();
> + return !res;
> +}
> #endif /* __KERNEL__ */
>
> /**
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..89ad6d2 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,13 @@ struct dev_addr_list
> #define dmi_users da_users
> #define dmi_gusers da_gusers
>
> +struct netdev_hw_addr {
> + struct list_head list;
> + unsigned char addr[MAX_ADDR_LEN];
> + int refcount;
> + struct rcu_head rcu_head;
> +};
> +
> struct hh_cache
> {
> struct hh_cache *hh_next; /* Next entry */
> @@ -776,8 +783,11 @@ struct net_device
> */
> unsigned long last_rx; /* Time of last Rx */
> /* Interface address info used in eth_type_trans() */
> - unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
> - because most packets are unicast) */
> + unsigned char *dev_addr; /* hw address, (before bcast
> + because most packets are
> + unicast) */
> +
> + struct list_head dev_addr_list; /* list of device hw addresses */
>
> unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
>
> @@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
> spin_unlock_bh(&dev->addr_list_lock);
> }
>
> +/*
> + * dev_addr_list walker. Should be used only for read access. Call with
> + * rcu_read_lock held.
> + */
> +#define for_each_dev_addr(dev, ha) \
> + list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
> +
> /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>
> extern void ether_setup(struct net_device *dev);
> @@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> alloc_netdev_mq(sizeof_priv, name, setup, 1)
> extern int register_netdev(struct net_device *dev);
> extern void unregister_netdev(struct net_device *dev);
> +
> +/* Functions used for device addresses handling */
> +extern int dev_addr_add(struct net_device *dev,
> + unsigned char *addr);
> +extern int dev_addr_del(struct net_device *dev,
> + unsigned char *addr);
> +extern int dev_addr_add_multiple(struct net_device *to_dev,
> + struct net_device *from_dev);
> +extern int dev_addr_del_multiple(struct net_device *to_dev,
> + struct net_device *from_dev);
> +
> /* Functions used for secondary unicast and multicast support */
> extern void dev_set_rx_mode(struct net_device *dev);
> extern void __dev_set_rx_mode(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 91d792d..961be4f 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3437,6 +3437,270 @@ void dev_set_rx_mode(struct net_device *dev)
> netif_addr_unlock_bh(dev);
> }
>
> +/* hw addresses list handling functions */
> +
> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> + int addr_len, int ignore_index)
> +{
> + struct netdev_hw_addr *ha;
> + int i = 0;
> +
> + if (addr_len > MAX_ADDR_LEN)
> + return -EINVAL;
> +

Please put here the ASSERT_RTNL(), not in various callers, since
this is the place where we really assume rtnl lock is locked by us.

You still use rcu_read_lock()/unlock() and rcu variant here...

But caller of this function has RTNL (or other lock) so dont use rcu here, as it seems
inconsistent with kzalloc() code that comes next.

> + rcu_read_lock();
> + list_for_each_entry_rcu(ha, list, list) {
> + if (i++ != ignore_index &&
> + !memcmp(ha->addr, addr, addr_len)) {
> + ha->refcount++;
> + rcu_read_unlock();
> + return 0;
> + }
> + }
> + rcu_read_unlock();
> +
> + ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
> + if (!ha)
> + return -ENOMEM;
> + memcpy(ha->addr, addr, addr_len);
> + ha->refcount = 1;
> + list_add_tail_rcu(&ha->list, list);
> + return 0;
> +}
> +
> +static int __hw_addr_add(struct list_head *list, unsigned char *addr,
> + int addr_len)
> +{
> + return __hw_addr_add_ii(list, addr, addr_len, -1);
> +}
> +
> +static void ha_rcu_free(struct rcu_head *head)
> +{
> + struct netdev_hw_addr *ha;
> +
> + ha = container_of(head, struct netdev_hw_addr, rcu_head);
> + kfree(ha);
> +}
> +
> +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
> + int addr_len, int ignore_index)
> +{
> + struct netdev_hw_addr *ha;
> + int i = 0;
> +

ASSERT_RTNL() here, not in callers.

> + list_for_each_entry(ha, list, list) {
> + if (i++ != ignore_index &&
> + !memcmp(ha->addr, addr, addr_len)) {
> + if (--ha->refcount)
> + return 0;
> + list_del_rcu(&ha->list);
> + call_rcu(&ha->rcu_head, ha_rcu_free);
> + return 0;
> + }
> + }
> + return -ENOENT;
> +}
> +
> +static int __hw_addr_del(struct list_head *list, unsigned char *addr,
> + int addr_len)
> +{
> + return __hw_addr_del_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len, int ignore_index)
> +{
> + int err = 0;
> + struct netdev_hw_addr *ha, *ha2;
> +

same here, no need for rcu_read_lock(), since you are going to change list, you
have RTNL lock or equivalent.

> + rcu_read_lock();
> + list_for_each_entry_rcu(ha, from_list, list) {
> + err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
> + if (err)
> + goto unroll;
> + }
> + goto unlock;
> +unroll:
> + list_for_each_entry_rcu(ha2, from_list, list) {
> + if (ha2 == ha)
> + break;
> + __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
> + }
> +unlock:
> + rcu_read_unlock();
> + return err;
> +}
> +
> +static int __hw_addr_add_multiple(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len)
> +{
> + return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_del_multiple_ii(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len, int ignore_index)
> +{
> + struct netdev_hw_addr *ha;
> +

same here, no rcu_read_lock() needed...

> + rcu_read_lock();
> + list_for_each_entry_rcu(ha, from_list, list) {
> + __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
> + }
> + rcu_read_unlock();
> +}
> +
> +static void __hw_addr_del_multiple(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len)
> +{
> + __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_flush(struct list_head *list)
> +{
> + struct netdev_hw_addr *ha, *tmp;
> +

ASSERT_RTNL();

> + list_for_each_entry_safe(ha, tmp, list, list) {
> + list_del_rcu(&ha->list);
> + call_rcu(&ha->rcu_head, ha_rcu_free);
> + }
> +}
> +
> +/* Device addresses handling functions */
> +
> +static void dev_addr_flush(struct net_device *dev)
> +{
> + ASSERT_RTNL();
> +
> + __hw_addr_flush(&dev->dev_addr_list);
> + dev->dev_addr = NULL;

seems risky here to set this to NULL... You could use a static var to avoid
further NULL dereference.

static char nulladdr[MAX_ADDR_LEN];
dev->dev_addr = nulladdr;

> +}
> +
> +static int dev_addr_init(struct net_device *dev)
> +{
> + unsigned char addr[MAX_ADDR_LEN];
> + struct netdev_hw_addr *ha;
> + int err;
> +
> + ASSERT_RTNL();
> +
> + INIT_LIST_HEAD(&dev->dev_addr_list);
> + memset(addr, 0, sizeof(*addr));
> + err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
> + if (!err) {
> + /*
> + * Get the first (previously created) address from the list
> + * and set dev_addr pointer to this location.
> + */
> + ha = list_first_entry(&dev->dev_addr_list,
> + struct netdev_hw_addr, list);
> + dev->dev_addr = ha->addr;
> + }
> + return err;
> +}
> +
> +/**
> + * dev_addr_add - Add a device address
> + * @dev: device
> + * @addr: address to add
> + *
> + * Add a device address to the device or increase the reference count if
> + * it already exists.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add(struct net_device *dev, unsigned char *addr)
> +{
> + int err;
> +
> + ASSERT_RTNL();
> +
> + err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> + if (!err)
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> + return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add);
> +
> +/**
> + * dev_addr_del - Release a device address.
> + * @dev: device
> + * @addr: address to delete
> + *
> + * Release reference to a device address and remove it from the device
> + * if the reference count drops to zero.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del(struct net_device *dev, unsigned char *addr)
> +{
> + int err;
> +
> + ASSERT_RTNL();
> +
> + err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> + if (!err)
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> + return err;
> +}
> +EXPORT_SYMBOL(dev_addr_del);
> +
> +/**
> + * dev_addr_add_multiple - Add device addresses from another device
> + * @to_dev: device to which addresses will be added
> + * @from_dev: device from which addresses will be added
> + *
> + * Add device addresses of the one device to another.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add_multiple(struct net_device *to_dev,
> + struct net_device *from_dev)
> +{
> + int err;
> +
> + ASSERT_RTNL();
> +
> + if (from_dev->addr_len != to_dev->addr_len)
> + return -EINVAL;
> + err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> + &from_dev->dev_addr_list,
> + to_dev->addr_len, 0);
> + if (!err)
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> + return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add_multiple);
> +
> +/**
> + * dev_addr_del_multiple - Delete device addresses by another device
> + * @to_dev: device where the addresses will be deleted
> + * @from_dev: device by which addresses the addresses will be deleted
> + *
> + * Deletes addresses in to device by the list of addresses in from device.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del_multiple(struct net_device *to_dev,
> + struct net_device *from_dev)
> +{
> + ASSERT_RTNL();
> +
> + if (from_dev->addr_len != to_dev->addr_len)
> + return -EINVAL;
> + __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> + &from_dev->dev_addr_list,
> + to_dev->addr_len, 0);
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> + return 0;
> +}
> +EXPORT_SYMBOL(dev_addr_del_multiple);
> +
> +/* unicast and multicast addresses handling functions */
> +
> int __dev_addr_delete(struct dev_addr_list **list, int *count,
> void *addr, int alen, int glbl)
> {
> @@ -4257,6 +4521,9 @@ static void rollback_registered(struct net_device *dev)
> */
> dev_addr_discard(dev);
>
> + /* Flush device addresses */
> + dev_addr_flush(dev);
> +

Are you sure that no driver in tree will dereference dev->dev_addr after this point ?

> if (dev->netdev_ops->ndo_uninit)
> dev->netdev_ops->ndo_uninit(dev);
>
> @@ -4779,6 +5046,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>
> dev->gso_max_size = GSO_MAX_SIZE;
>
> + dev_addr_init(dev);
> netdev_init_queues(dev);
>
> INIT_LIST_HEAD(&dev->napi_list);
> @@ -4965,6 +5233,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
> */
> dev_addr_discard(dev);
>
> + /* Flush device addresses */
> + dev_addr_flush(dev);
> +
> netdev_unregister_kobject(dev);
>
> /* Actually switch the network namespace */

2009-04-16 08:48:04

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v2)

Wed, Apr 15, 2009 at 08:54:05PM CEST, [email protected] wrote:
>Jiri Pirko a ?crit :

<snip>

>> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>> + int addr_len, int ignore_index)
>> +{
>> + struct netdev_hw_addr *ha;
>> + int i = 0;
>> +
>> + if (addr_len > MAX_ADDR_LEN)
>> + return -EINVAL;
>> +
>
>Please put here the ASSERT_RTNL(), not in various callers, since
>this is the place where we really assume rtnl lock is locked by us.

Well I'd like to have ASSERT_RTNL in callers. The reason is that for this
purpose (dev_addr) the guarding lock is rtnl. But for example for multicast
addresses it won't be. It will be most probably a spin lock. But those callers
(multicast) will use this __hw_addr_xxx functions too. Therefore I'd like to
leave locking on current level.
>
>You still use rcu_read_lock()/unlock() and rcu variant here...

Yes this is unecessrary and confusing I agree. Will remove these read locks in
places where there is guarded by rtnl mutex.
>
>But caller of this function has RTNL (or other lock) so dont use rcu here, as it seems
>inconsistent with kzalloc() code that comes next.
>
>> + rcu_read_lock();
>> + list_for_each_entry_rcu(ha, list, list) {
>> + if (i++ != ignore_index &&
>> + !memcmp(ha->addr, addr, addr_len)) {
>> + ha->refcount++;
>> + rcu_read_unlock();
>> + return 0;
>> + }
>> + }
>> + rcu_read_unlock();
>> +
>> + ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>> + if (!ha)
>> + return -ENOMEM;
>> + memcpy(ha->addr, addr, addr_len);
>> + ha->refcount = 1;
>> + list_add_tail_rcu(&ha->list, list);
>> + return 0;

<snip>

>> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
>> + struct list_head *from_list,
>> + int addr_len, int ignore_index)
>> +{
>> + int err = 0;
>> + struct netdev_hw_addr *ha, *ha2;
>> +
>
>same here, no need for rcu_read_lock(), since you are going to change list, you
>have RTNL lock or equivalent.
>
Yes, I wanted to show that for "from_list" this is a reader...
....unnecessary,foolish -> removing...
>> + rcu_read_lock();
>> + list_for_each_entry_rcu(ha, from_list, list) {
>> + err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
>> + if (err)
>> + goto unroll;
>> + }
>> + goto unlock;
>> +unroll:
>> + list_for_each_entry_rcu(ha2, from_list, list) {
>> + if (ha2 == ha)
>> + break;
>> + __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
>> + }
>> +unlock:
>> + rcu_read_unlock();
>> + return err;
>> +}
>> +

<snip>

>> +static void dev_addr_flush(struct net_device *dev)
>> +{
>> + ASSERT_RTNL();
>> +
>> + __hw_addr_flush(&dev->dev_addr_list);
>> + dev->dev_addr = NULL;
>
>seems risky here to set this to NULL... You could use a static var to avoid
>further NULL dereference.
>
>static char nulladdr[MAX_ADDR_LEN];
>dev->dev_addr = nulladdr;
>
>> +}
>> +

<snip>

>> @@ -4257,6 +4521,9 @@ static void rollback_registered(struct net_device *dev)
>> */
>> dev_addr_discard(dev);
>>
>> + /* Flush device addresses */
>> + dev_addr_flush(dev);
>> +
>
>Are you sure that no driver in tree will dereference dev->dev_addr after this point ?

I assume that driver might not use dev_addr after it calls
unregister_netdevice(). But ok - I would rather move calling dev_addr_flush()
somewhere later where there is a guarantee that dev_addr should not be
referenced. Perhaps in free_netdev() ? It would also correspond with calling
dev_addr_init() in alloc_netdev_mq()...
>
>> if (dev->netdev_ops->ndo_uninit)
>> dev->netdev_ops->ndo_uninit(dev);
>>
>> @@ -4779,6 +5046,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>>
>> dev->gso_max_size = GSO_MAX_SIZE;
>>
>> + dev_addr_init(dev);
>> netdev_init_queues(dev);
>>
>> INIT_LIST_HEAD(&dev->napi_list);
>> @@ -4965,6 +5233,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>> */
>> dev_addr_discard(dev);
>>
>> + /* Flush device addresses */
>> + dev_addr_flush(dev);
>> +
>> netdev_unregister_kobject(dev);
>>
>> /* Actually switch the network namespace */
>
>

2009-04-17 12:00:30

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)

v2 -> v3 (current):
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of rcu_synchronize
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with device address list. The list is a replacement
for the original dev_addr field and because in some situations there is need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
sees no difference.

Signed-off-by: Jiri Pirko <[email protected]>
---
include/linux/etherdevice.h | 27 +++++
include/linux/netdevice.h | 32 +++++-
net/core/dev.c | 261 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 318 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
return compare_ether_addr(addr1, addr2);
#endif
}
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address if one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+ const u8 addr[6 + 2])
+{
+ struct netdev_hw_addr *ha;
+ int res = 1;
+
+ rcu_read_lock();
+ for_each_dev_addr(dev, ha) {
+ res = compare_ether_addr_64bits(addr, ha->addr);
+ if (!res)
+ break;
+ }
+ rcu_read_unlock();
+ return !res;
+}
#endif /* __KERNEL__ */

/**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..89ad6d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,13 @@ struct dev_addr_list
#define dmi_users da_users
#define dmi_gusers da_gusers

+struct netdev_hw_addr {
+ struct list_head list;
+ unsigned char addr[MAX_ADDR_LEN];
+ int refcount;
+ struct rcu_head rcu_head;
+};
+
struct hh_cache
{
struct hh_cache *hh_next; /* Next entry */
@@ -776,8 +783,11 @@ struct net_device
*/
unsigned long last_rx; /* Time of last Rx */
/* Interface address info used in eth_type_trans() */
- unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
- because most packets are unicast) */
+ unsigned char *dev_addr; /* hw address, (before bcast
+ because most packets are
+ unicast) */
+
+ struct list_head dev_addr_list; /* list of device hw addresses */

unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */

@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
spin_unlock_bh(&dev->addr_list_lock);
}

+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+ list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
/* These functions live elsewhere (drivers/net/net_init.c, but related) */

extern void ether_setup(struct net_device *dev);
@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
alloc_netdev_mq(sizeof_priv, name, setup, 1)
extern int register_netdev(struct net_device *dev);
extern void unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int dev_addr_add(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_del(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+extern int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+
/* Functions used for secondary unicast and multicast support */
extern void dev_set_rx_mode(struct net_device *dev);
extern void __dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 343883f..b4503ac 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
netif_addr_unlock_bh(dev);
}

+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ ha->refcount++;
+ return 0;
+ }
+ }
+
+ ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->refcount = 1;
+ list_add_tail_rcu(&ha->list, list);
+ return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+ struct netdev_hw_addr *ha;
+
+ ha = container_of(head, struct netdev_hw_addr, rcu_head);
+ kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ if (--ha->refcount)
+ return 0;
+ list_del_rcu(&ha->list);
+ call_rcu(&ha->rcu_head, ha_rcu_free);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ int err;
+ struct netdev_hw_addr *ha, *ha2;
+
+ list_for_each_entry(ha, from_list, list) {
+ err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+ if (err)
+ goto unroll;
+ }
+ return 0;
+
+unroll:
+ list_for_each_entry(ha2, from_list, list) {
+ if (ha2 == ha)
+ break;
+ __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+ }
+ return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+
+ list_for_each_entry(ha, from_list, list) {
+ __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+ }
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, list, list) {
+ list_del_rcu(&ha->list);
+ call_rcu(&ha->rcu_head, ha_rcu_free);
+ }
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ __hw_addr_flush(&dev->dev_addr_list);
+ dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ ASSERT_RTNL();
+
+ INIT_LIST_HEAD(&dev->dev_addr_list);
+ memset(addr, 0, sizeof(*addr));
+ err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ ha = list_first_entry(&dev->dev_addr_list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ }
+ return err;
+}
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ * dev_addr_add_multiple - Add device addresses from another device
+ * @to_dev: device to which addresses will be added
+ * @from_dev: device from which addresses will be added
+ *
+ * Add device addresses of the one device to another.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ * dev_addr_del_multiple - Delete device addresses by another device
+ * @to_dev: device where the addresses will be deleted
+ * @from_dev: device by which addresses the addresses will be deleted
+ *
+ * Deletes addresses in to device by the list of addresses in from device.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
int __dev_addr_delete(struct dev_addr_list **list, int *count,
void *addr, int alen, int glbl)
{
@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,

dev->gso_max_size = GSO_MAX_SIZE;

+ dev_addr_init(dev);
netdev_init_queues(dev);

INIT_LIST_HEAD(&dev->napi_list);
@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)

kfree(dev->_tx);

+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);

--
1.6.0.6

2009-04-17 15:33:35

by Stephen Hemminger

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)

On Fri, 17 Apr 2009 13:57:24 +0200
Jiri Pirko <[email protected]> wrote:

> v2 -> v3 (current):
> -removed unnecessary rcu read locking
> -moved dev_addr_flush() calling to ensure no null dereference of dev_addr
>
> v1 -> v2:
> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
> -removed unnecessary rcu_read locking in dev_addr_init
> -use compare_ether_addr_64bits instead of compare_ether_addr
> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
> -use call_rcu instead of rcu_synchronize
> -moved is_etherdev_addr into __KERNEL__ ifdef
>
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
>
> Signed-off-by: Jiri Pirko <[email protected]>
> ---
> include/linux/etherdevice.h | 27 +++++
> include/linux/netdevice.h | 32 +++++-
> net/core/dev.c | 261 +++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 318 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index a1f17ab..3d7a668 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
> return compare_ether_addr(addr1, addr2);
> #endif
> }
> +
> +/**
> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
> + * @dev: Pointer to a device structure
> + * @addr: Pointer to a six-byte array containing the Ethernet address
> + *
> + * Compare passed address with all addresses of the device. Return true if the
> + * address if one of the device addresses.
> + *
> + * Note that this function calls compare_ether_addr_64bits() so take care of
> + * the right padding.
> + */
> +static inline bool is_etherdev_addr(const struct net_device *dev,
> + const u8 addr[6 + 2])
> +{
> + struct netdev_hw_addr *ha;
> + int res = 1;
> +
> + rcu_read_lock();
> + for_each_dev_addr(dev, ha) {
> + res = compare_ether_addr_64bits(addr, ha->addr);
> + if (!res)
> + break;
> + }
> + rcu_read_unlock();
> + return !res;
> +}
> #endif /* __KERNEL__ */
>
> /**
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..89ad6d2 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,13 @@ struct dev_addr_list
> #define dmi_users da_users
> #define dmi_gusers da_gusers
>
> +struct netdev_hw_addr {
> + struct list_head list;
> + unsigned char addr[MAX_ADDR_LEN];
> + int refcount;
> + struct rcu_head rcu_head;
> +};

Minor nit, the ordering of elements cause holes that might not be
needed.

Space saving? is rcu_head needed or would using synchronize_net
make code cleaner and save space.

> struct hh_cache
> {
> struct hh_cache *hh_next; /* Next entry */
> @@ -776,8 +783,11 @@ struct net_device
> */
> unsigned long last_rx; /* Time of last Rx */
> /* Interface address info used in eth_type_trans() */
> - unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
> - because most packets are unicast) */
> + unsigned char *dev_addr; /* hw address, (before bcast
> + because most packets are
> + unicast) */
> +
> + struct list_head dev_addr_list; /* list of device hw addresses */
>
> unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
>
> @@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
> spin_unlock_bh(&dev->addr_list_lock);
> }
>
> +/*
> + * dev_addr_list walker. Should be used only for read access. Call with
> + * rcu_read_lock held.
> + */
> +#define for_each_dev_addr(dev, ha) \
> + list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
> +
> /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>
> extern void ether_setup(struct net_device *dev);
> @@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> alloc_netdev_mq(sizeof_priv, name, setup, 1)
> extern int register_netdev(struct net_device *dev);
> extern void unregister_netdev(struct net_device *dev);
> +
> +/* Functions used for device addresses handling */
> +extern int dev_addr_add(struct net_device *dev,
> + unsigned char *addr);
> +extern int dev_addr_del(struct net_device *dev,
> + unsigned char *addr);
> +extern int dev_addr_add_multiple(struct net_device *to_dev,
> + struct net_device *from_dev);
> +extern int dev_addr_del_multiple(struct net_device *to_dev,
> + struct net_device *from_dev);
> +
> /* Functions used for secondary unicast and multicast support */
> extern void dev_set_rx_mode(struct net_device *dev);
> extern void __dev_set_rx_mode(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 343883f..b4503ac 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
> netif_addr_unlock_bh(dev);
> }
>
> +/* hw addresses list handling functions */
> +
> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> + int addr_len, int ignore_index)
> +{
> + struct netdev_hw_addr *ha;
> + int i = 0;
> +
> + if (addr_len > MAX_ADDR_LEN)
> + return -EINVAL;
> +
> + list_for_each_entry(ha, list, list) {
> + if (i++ != ignore_index &&
> + !memcmp(ha->addr, addr, addr_len)) {
> + ha->refcount++;
> + return 0;
> + }
> + }
> +
> + ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
> + if (!ha)
> + return -ENOMEM;
Since you are initializing all fields, kzalloc isn't really needed

> + memcpy(ha->addr, addr, addr_len);
> + ha->refcount = 1;
> + list_add_tail_rcu(&ha->list, list);
> + return 0;
> +}
> +
> +static int __hw_addr_add(struct list_head *list, unsigned char *addr,
> + int addr_len)
> +{
> + return __hw_addr_add_ii(list, addr, addr_len, -1);
> +}
> +
> +static void ha_rcu_free(struct rcu_head *head)
> +{
> + struct netdev_hw_addr *ha;
> +
> + ha = container_of(head, struct netdev_hw_addr, rcu_head);
> + kfree(ha);
> +}
> +
> +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
> + int addr_len, int ignore_index)
> +{
> + struct netdev_hw_addr *ha;
> + int i = 0;
> +
> + list_for_each_entry(ha, list, list) {
> + if (i++ != ignore_index &&
> + !memcmp(ha->addr, addr, addr_len)) {
> + if (--ha->refcount)
> + return 0;
> + list_del_rcu(&ha->list);
> + call_rcu(&ha->rcu_head, ha_rcu_free);
> + return 0;
> + }
> + }
> + return -ENOENT;
> +}
> +
> +static int __hw_addr_del(struct list_head *list, unsigned char *addr,
> + int addr_len)
> +{
> + return __hw_addr_del_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len, int ignore_index)
> +{
> + int err;
> + struct netdev_hw_addr *ha, *ha2;
> +
> + list_for_each_entry(ha, from_list, list) {
> + err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
> + if (err)
> + goto unroll;
> + }
> + return 0;
> +
> +unroll:
> + list_for_each_entry(ha2, from_list, list) {
> + if (ha2 == ha)
> + break;
> + __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
> + }
> + return err;
> +}
> +
> +static int __hw_addr_add_multiple(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len)
> +{
> + return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_del_multiple_ii(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len, int ignore_index)
> +{
> + struct netdev_hw_addr *ha;
> +
> + list_for_each_entry(ha, from_list, list) {
> + __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
> + }
> +}
> +
> +static void __hw_addr_del_multiple(struct list_head *to_list,
> + struct list_head *from_list,
> + int addr_len)
> +{
> + __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_flush(struct list_head *list)
> +{
> + struct netdev_hw_addr *ha, *tmp;
> +
> + list_for_each_entry_safe(ha, tmp, list, list) {
> + list_del_rcu(&ha->list);
> + call_rcu(&ha->rcu_head, ha_rcu_free);
> + }
> +}
> +
> +/* Device addresses handling functions */
> +
> +static void dev_addr_flush(struct net_device *dev)
> +{
> + ASSERT_RTNL();
> +
Since this is local you should be able to audit all
the callers and remove this ASSERT.

> + __hw_addr_flush(&dev->dev_addr_list);
> + dev->dev_addr = NULL;
> +}
> +
> +static int dev_addr_init(struct net_device *dev)
> +{
> + unsigned char addr[MAX_ADDR_LEN];
> + struct netdev_hw_addr *ha;
> + int err;
> +
> + ASSERT_RTNL();
Ditto, ASSERT_RTNL makes sense for exposed kernel API and
initial testing.

> + INIT_LIST_HEAD(&dev->dev_addr_list);
> + memset(addr, 0, sizeof(*addr));
> + err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
> + if (!err) {
> + /*
> + * Get the first (previously created) address from the list
> + * and set dev_addr pointer to this location.
> + */
> + ha = list_first_entry(&dev->dev_addr_list,
> + struct netdev_hw_addr, list);
> + dev->dev_addr = ha->addr;
> + }
> + return err;
> +}
> +
> +/**
> + * dev_addr_add - Add a device address
> + * @dev: device
> + * @addr: address to add
> + *
> + * Add a device address to the device or increase the reference count if
> + * it already exists.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add(struct net_device *dev, unsigned char *addr)
> +{
> + int err;
> +
> + ASSERT_RTNL();
> +
> + err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> + if (!err)
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> + return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add);
> +
> +/**
> + * dev_addr_del - Release a device address.
> + * @dev: device
> + * @addr: address to delete
> + *
> + * Release reference to a device address and remove it from the device
> + * if the reference count drops to zero.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del(struct net_device *dev, unsigned char *addr)
> +{
> + int err;
> +
> + ASSERT_RTNL();
> +
> + err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> + if (!err)
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> + return err;
> +}
> +EXPORT_SYMBOL(dev_addr_del);
> +
> +/**
> + * dev_addr_add_multiple - Add device addresses from another device
> + * @to_dev: device to which addresses will be added
> + * @from_dev: device from which addresses will be added
> + *
> + * Add device addresses of the one device to another.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add_multiple(struct net_device *to_dev,
> + struct net_device *from_dev)
> +{
> + int err;
> +
> + ASSERT_RTNL();
> +
> + if (from_dev->addr_len != to_dev->addr_len)
> + return -EINVAL;
> + err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> + &from_dev->dev_addr_list,
> + to_dev->addr_len, 0);
> + if (!err)
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> + return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add_multiple);
> +
> +/**
> + * dev_addr_del_multiple - Delete device addresses by another device
> + * @to_dev: device where the addresses will be deleted
> + * @from_dev: device by which addresses the addresses will be deleted
> + *
> + * Deletes addresses in to device by the list of addresses in from device.
> + *
> + * The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del_multiple(struct net_device *to_dev,
> + struct net_device *from_dev)
> +{
> + ASSERT_RTNL();
> +
> + if (from_dev->addr_len != to_dev->addr_len)
> + return -EINVAL;
> + __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> + &from_dev->dev_addr_list,
> + to_dev->addr_len, 0);
> + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> + return 0;
> +}
> +EXPORT_SYMBOL(dev_addr_del_multiple);
> +
> +/* unicast and multicast addresses handling functions */
> +
> int __dev_addr_delete(struct dev_addr_list **list, int *count,
> void *addr, int alen, int glbl)
> {
> @@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>
> dev->gso_max_size = GSO_MAX_SIZE;
>
> + dev_addr_init(dev);
> netdev_init_queues(dev);
>
> INIT_LIST_HEAD(&dev->napi_list);
> @@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
>
> kfree(dev->_tx);
>
> + /* Flush device addresses */
> + dev_addr_flush(dev);
> +
> list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
> netif_napi_del(p);
>

2009-04-18 07:03:37

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)

Fri, Apr 17, 2009 at 05:33:15PM CEST, [email protected] wrote:

<snip>

>> +struct netdev_hw_addr {
>> + struct list_head list;
>> + unsigned char addr[MAX_ADDR_LEN];
>> + int refcount;
>> + struct rcu_head rcu_head;
>> +};
>
>Minor nit, the ordering of elements cause holes that might not be
>needed.

Agree that ordering might be done better. Will do.
>
>Space saving? is rcu_head needed or would using synchronize_net
>make code cleaner and save space.
>

Well I originaly had this done by synchronize_rcu(). Eric argued that it might
cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
called) once it hits the tree.

<snip>

>> + ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>> + if (!ha)
>> + return -ENOMEM;
>Since you are initializing all fields, kzalloc isn't really needed

Noted.
>
>> + memcpy(ha->addr, addr, addr_len);
>> + ha->refcount = 1;
>> + list_add_tail_rcu(&ha->list, list);
>> + return 0;
>> +}

<snip>

>> +static void dev_addr_flush(struct net_device *dev)
>> +{
>> + ASSERT_RTNL();
>> +
>Since this is local you should be able to audit all
>the callers and remove this ASSERT.

Okay. I will at least put a comment instead of this.
>
>> + __hw_addr_flush(&dev->dev_addr_list);
>> + dev->dev_addr = NULL;
>> +}
>> +
>> +static int dev_addr_init(struct net_device *dev)
>> +{
>> + unsigned char addr[MAX_ADDR_LEN];
>> + struct netdev_hw_addr *ha;
>> + int err;
>> +
>> + ASSERT_RTNL();
>Ditto, ASSERT_RTNL makes sense for exposed kernel API and
>initial testing.
>
>> + INIT_LIST_HEAD(&dev->dev_addr_list);
>> + memset(addr, 0, sizeof(*addr));
>> + err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>> + if (!err) {
>> + /*
>> + * Get the first (previously created) address from the list
>> + * and set dev_addr pointer to this location.
>> + */
>> + ha = list_first_entry(&dev->dev_addr_list,
>> + struct netdev_hw_addr, list);
>> + dev->dev_addr = ha->addr;
>> + }
>> + return err;
>> +}

<snip>

2009-04-18 07:36:26

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)

Jiri Pirko a ?crit :
> Fri, Apr 17, 2009 at 05:33:15PM CEST, [email protected] wrote:
>
> <snip>
>
>>> +struct netdev_hw_addr {
>>> + struct list_head list;
>>> + unsigned char addr[MAX_ADDR_LEN];
>>> + int refcount;
>>> + struct rcu_head rcu_head;
>>> +};
>> Minor nit, the ordering of elements cause holes that might not be
>> needed.
>
> Agree that ordering might be done better. Will do.
>> Space saving? is rcu_head needed or would using synchronize_net
>> make code cleaner and save space.
>>
>
> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
> called) once it hits the tree.
>

Yes, and dont forget we wont save space, as we allocate a full
cache line to hold a 'struct netdev_hw_addr', since we dont want this
critical and read_mostly object polluted by a hot spot elsewhere in kernel...

Considering this, letting 'rcu_head' at the end of structure, even if we
have an eventual hole on 64 bit arches is not really a problem, and IMHO
the best thing to do, as rcu_head is only used at dismantle time.

And yes, maybe kfree_rcu() will makes its way in kernel, eventually :)

Thank you

2009-04-18 07:45:32

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)

Sat, Apr 18, 2009 at 09:35:32AM CEST, [email protected] wrote:
>Jiri Pirko a ?crit :
>> Fri, Apr 17, 2009 at 05:33:15PM CEST, [email protected] wrote:
>>
>> <snip>
>>
>>>> +struct netdev_hw_addr {
>>>> + struct list_head list;
>>>> + unsigned char addr[MAX_ADDR_LEN];
>>>> + int refcount;
>>>> + struct rcu_head rcu_head;
>>>> +};
>>> Minor nit, the ordering of elements cause holes that might not be
>>> needed.
>>
>> Agree that ordering might be done better. Will do.
>>> Space saving? is rcu_head needed or would using synchronize_net
>>> make code cleaner and save space.
>>>
>>
>> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
>> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
>> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
>> called) once it hits the tree.
>>
>
>Yes, and dont forget we wont save space, as we allocate a full
>cache line to hold a 'struct netdev_hw_addr', since we dont want this
>critical and read_mostly object polluted by a hot spot elsewhere in kernel...
>
>Considering this, letting 'rcu_head' at the end of structure, even if we
>have an eventual hole on 64 bit arches is not really a problem, and IMHO
>the best thing to do, as rcu_head is only used at dismantle time.

I will order the struct better, there are archs with small cache line size where
it makes sense.

>
>And yes, maybe kfree_rcu() will makes its way in kernel, eventually :)
>
>Thank you
>
>

2009-04-18 08:09:00

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v3)

Jiri Pirko a ?crit :
> Sat, Apr 18, 2009 at 09:35:32AM CEST, [email protected] wrote:
>> Jiri Pirko a ?crit :
>>> Fri, Apr 17, 2009 at 05:33:15PM CEST, [email protected] wrote:
>>>
>>> <snip>
>>>
>>>>> +struct netdev_hw_addr {
>>>>> + struct list_head list;
>>>>> + unsigned char addr[MAX_ADDR_LEN];
>>>>> + int refcount;
>>>>> + struct rcu_head rcu_head;
>>>>> +};
>>>> Minor nit, the ordering of elements cause holes that might not be
>>>> needed.
>>> Agree that ordering might be done better. Will do.
>>>> Space saving? is rcu_head needed or would using synchronize_net
>>>> make code cleaner and save space.
>>>>
>>> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
>>> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
>>> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
>>> called) once it hits the tree.
>>>
>> Yes, and dont forget we wont save space, as we allocate a full
>> cache line to hold a 'struct netdev_hw_addr', since we dont want this
>> critical and read_mostly object polluted by a hot spot elsewhere in kernel...
>>
>> Considering this, letting 'rcu_head' at the end of structure, even if we
>> have an eventual hole on 64 bit arches is not really a problem, and IMHO
>> the best thing to do, as rcu_head is only used at dismantle time.
>
> I will order the struct better, there are archs with small cache line size where
> it makes sense.

How exactly ?

If you consider a 32bit arch with 16 or 32 bytes cache line,
sizeof(struct_list_dead) is 8
sizeof(addr) = 32 (but we really use 6 bytes for ethernet)

struct netdev_hw_addr {
unsigned char addr[MAX_ADDR_LEN];
struct list_head list;
int refcount;
struct rcu_head rcu_head;
};

would cost more at lookup time, since we would use two cache lines

struct netdev_hw_addr {
struct list_head list;
unsigned char addr[MAX_ADDR_LEN];
int refcount;
struct rcu_head rcu_head;
};

Is nicer, because at least 8 bytes of addr share the same cache line
than list. So direct dev->dev_addr would be fast (for devices with one
address), and is_etherdev_addr() would still use one cache line per
item.

2009-04-18 09:01:26

by Jiri Pirko

[permalink] [raw]
Subject: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)

v3 -> v4 (current):
-changed kzalloc to kmalloc in __hw_addr_add_ii()
-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()

v2 -> v3:
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of rcu_synchronize
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with device address list. The list is a replacement
for the original dev_addr field and because in some situations there is need to
carry several device addresses with the net device. To be backward compatible,
dev_addr is made to point to the first member of the list so original drivers
sees no difference.

Signed-off-by: Jiri Pirko <[email protected]>
---
include/linux/etherdevice.h | 27 +++++
include/linux/netdevice.h | 32 +++++-
net/core/dev.c | 261 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 318 insertions(+), 2 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
return compare_ether_addr(addr1, addr2);
#endif
}
+
+/**
+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Compare passed address with all addresses of the device. Return true if the
+ * address if one of the device addresses.
+ *
+ * Note that this function calls compare_ether_addr_64bits() so take care of
+ * the right padding.
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+ const u8 addr[6 + 2])
+{
+ struct netdev_hw_addr *ha;
+ int res = 1;
+
+ rcu_read_lock();
+ for_each_dev_addr(dev, ha) {
+ res = compare_ether_addr_64bits(addr, ha->addr);
+ if (!res)
+ break;
+ }
+ rcu_read_unlock();
+ return !res;
+}
#endif /* __KERNEL__ */

/**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..89ad6d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,13 @@ struct dev_addr_list
#define dmi_users da_users
#define dmi_gusers da_gusers

+struct netdev_hw_addr {
+ struct list_head list;
+ unsigned char addr[MAX_ADDR_LEN];
+ int refcount;
+ struct rcu_head rcu_head;
+};
+
struct hh_cache
{
struct hh_cache *hh_next; /* Next entry */
@@ -776,8 +783,11 @@ struct net_device
*/
unsigned long last_rx; /* Time of last Rx */
/* Interface address info used in eth_type_trans() */
- unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
- because most packets are unicast) */
+ unsigned char *dev_addr; /* hw address, (before bcast
+ because most packets are
+ unicast) */
+
+ struct list_head dev_addr_list; /* list of device hw addresses */

unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */

@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
spin_unlock_bh(&dev->addr_list_lock);
}

+/*
+ * dev_addr_list walker. Should be used only for read access. Call with
+ * rcu_read_lock held.
+ */
+#define for_each_dev_addr(dev, ha) \
+ list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
+
/* These functions live elsewhere (drivers/net/net_init.c, but related) */

extern void ether_setup(struct net_device *dev);
@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
alloc_netdev_mq(sizeof_priv, name, setup, 1)
extern int register_netdev(struct net_device *dev);
extern void unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int dev_addr_add(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_del(struct net_device *dev,
+ unsigned char *addr);
+extern int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+extern int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev);
+
/* Functions used for secondary unicast and multicast support */
extern void dev_set_rx_mode(struct net_device *dev);
extern void __dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 343883f..2274294 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
netif_addr_unlock_bh(dev);
}

+/* hw addresses list handling functions */
+
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ ha->refcount++;
+ return 0;
+ }
+ }
+
+ ha = kmalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->refcount = 1;
+ list_add_tail_rcu(&ha->list, list);
+ return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_add_ii(list, addr, addr_len, -1);
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+ struct netdev_hw_addr *ha;
+
+ ha = container_of(head, struct netdev_hw_addr, rcu_head);
+ kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len)) {
+ if (--ha->refcount)
+ return 0;
+ list_del_rcu(&ha->list);
+ call_rcu(&ha->rcu_head, ha_rcu_free);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+ int addr_len)
+{
+ return __hw_addr_del_ii(list, addr, addr_len, -1);
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ int err;
+ struct netdev_hw_addr *ha, *ha2;
+
+ list_for_each_entry(ha, from_list, list) {
+ err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
+ if (err)
+ goto unroll;
+ }
+ return 0;
+
+unroll:
+ list_for_each_entry(ha2, from_list, list) {
+ if (ha2 == ha)
+ break;
+ __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
+ }
+ return err;
+}
+
+static int __hw_addr_add_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+
+ list_for_each_entry(ha, from_list, list) {
+ __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
+ }
+}
+
+static void __hw_addr_del_multiple(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len)
+{
+ __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, list, list) {
+ list_del_rcu(&ha->list);
+ call_rcu(&ha->rcu_head, ha_rcu_free);
+ }
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+ /* rtnl_mutex must be held here */
+
+ __hw_addr_flush(&dev->dev_addr_list);
+ dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ /* rtnl_mutex must be held here */
+
+ INIT_LIST_HEAD(&dev->dev_addr_list);
+ memset(addr, 0, sizeof(*addr));
+ err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ ha = list_first_entry(&dev->dev_addr_list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ }
+ return err;
+}
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ * dev_addr_add_multiple - Add device addresses from another device
+ * @to_dev: device to which addresses will be added
+ * @from_dev: device from which addresses will be added
+ *
+ * Add device addresses of the one device to another.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ * dev_addr_del_multiple - Delete device addresses by another device
+ * @to_dev: device where the addresses will be deleted
+ * @from_dev: device by which addresses the addresses will be deleted
+ *
+ * Deletes addresses in to device by the list of addresses in from device.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev)
+{
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, 0);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
int __dev_addr_delete(struct dev_addr_list **list, int *count,
void *addr, int alen, int glbl)
{
@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,

dev->gso_max_size = GSO_MAX_SIZE;

+ dev_addr_init(dev);
netdev_init_queues(dev);

INIT_LIST_HEAD(&dev->napi_list);
@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)

kfree(dev->_tx);

+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);

--
1.6.0.6

2009-04-20 16:15:38

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)

How about this (and another 2 patches in patchset)? What's your opinion guys?

Thanks,

Jirka

Sat, Apr 18, 2009 at 10:58:49AM CEST, [email protected] wrote:
>v3 -> v4 (current):
>-changed kzalloc to kmalloc in __hw_addr_add_ii()
>-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
>
>v2 -> v3:
>-removed unnecessary rcu read locking
>-moved dev_addr_flush() calling to ensure no null dereference of dev_addr
>
>v1 -> v2:
>-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
>-removed unnecessary rcu_read locking in dev_addr_init
>-use compare_ether_addr_64bits instead of compare_ether_addr
>-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
>-use call_rcu instead of rcu_synchronize
>-moved is_etherdev_addr into __KERNEL__ ifdef
>
>This patch introduces a new list in struct net_device and brings a set of
>functions to handle the work with device address list. The list is a replacement
>for the original dev_addr field and because in some situations there is need to
>carry several device addresses with the net device. To be backward compatible,
>dev_addr is made to point to the first member of the list so original drivers
>sees no difference.
>
>Signed-off-by: Jiri Pirko <[email protected]>
>---
> include/linux/etherdevice.h | 27 +++++
> include/linux/netdevice.h | 32 +++++-
> net/core/dev.c | 261 +++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 318 insertions(+), 2 deletions(-)
>
>diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
>index a1f17ab..3d7a668 100644
>--- a/include/linux/etherdevice.h
>+++ b/include/linux/etherdevice.h
>@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
> return compare_ether_addr(addr1, addr2);
> #endif
> }
>+
>+/**
>+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
>+ * @dev: Pointer to a device structure
>+ * @addr: Pointer to a six-byte array containing the Ethernet address
>+ *
>+ * Compare passed address with all addresses of the device. Return true if the
>+ * address if one of the device addresses.
>+ *
>+ * Note that this function calls compare_ether_addr_64bits() so take care of
>+ * the right padding.
>+ */
>+static inline bool is_etherdev_addr(const struct net_device *dev,
>+ const u8 addr[6 + 2])
>+{
>+ struct netdev_hw_addr *ha;
>+ int res = 1;
>+
>+ rcu_read_lock();
>+ for_each_dev_addr(dev, ha) {
>+ res = compare_ether_addr_64bits(addr, ha->addr);
>+ if (!res)
>+ break;
>+ }
>+ rcu_read_unlock();
>+ return !res;
>+}
> #endif /* __KERNEL__ */
>
> /**
>diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>index 2e7783f..89ad6d2 100644
>--- a/include/linux/netdevice.h
>+++ b/include/linux/netdevice.h
>@@ -210,6 +210,13 @@ struct dev_addr_list
> #define dmi_users da_users
> #define dmi_gusers da_gusers
>
>+struct netdev_hw_addr {
>+ struct list_head list;
>+ unsigned char addr[MAX_ADDR_LEN];
>+ int refcount;
>+ struct rcu_head rcu_head;
>+};
>+
> struct hh_cache
> {
> struct hh_cache *hh_next; /* Next entry */
>@@ -776,8 +783,11 @@ struct net_device
> */
> unsigned long last_rx; /* Time of last Rx */
> /* Interface address info used in eth_type_trans() */
>- unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
>- because most packets are unicast) */
>+ unsigned char *dev_addr; /* hw address, (before bcast
>+ because most packets are
>+ unicast) */
>+
>+ struct list_head dev_addr_list; /* list of device hw addresses */
>
> unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
>
>@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
> spin_unlock_bh(&dev->addr_list_lock);
> }
>
>+/*
>+ * dev_addr_list walker. Should be used only for read access. Call with
>+ * rcu_read_lock held.
>+ */
>+#define for_each_dev_addr(dev, ha) \
>+ list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
>+
> /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>
> extern void ether_setup(struct net_device *dev);
>@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> alloc_netdev_mq(sizeof_priv, name, setup, 1)
> extern int register_netdev(struct net_device *dev);
> extern void unregister_netdev(struct net_device *dev);
>+
>+/* Functions used for device addresses handling */
>+extern int dev_addr_add(struct net_device *dev,
>+ unsigned char *addr);
>+extern int dev_addr_del(struct net_device *dev,
>+ unsigned char *addr);
>+extern int dev_addr_add_multiple(struct net_device *to_dev,
>+ struct net_device *from_dev);
>+extern int dev_addr_del_multiple(struct net_device *to_dev,
>+ struct net_device *from_dev);
>+
> /* Functions used for secondary unicast and multicast support */
> extern void dev_set_rx_mode(struct net_device *dev);
> extern void __dev_set_rx_mode(struct net_device *dev);
>diff --git a/net/core/dev.c b/net/core/dev.c
>index 343883f..2274294 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
> netif_addr_unlock_bh(dev);
> }
>
>+/* hw addresses list handling functions */
>+
>+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>+ int addr_len, int ignore_index)
>+{
>+ struct netdev_hw_addr *ha;
>+ int i = 0;
>+
>+ if (addr_len > MAX_ADDR_LEN)
>+ return -EINVAL;
>+
>+ list_for_each_entry(ha, list, list) {
>+ if (i++ != ignore_index &&
>+ !memcmp(ha->addr, addr, addr_len)) {
>+ ha->refcount++;
>+ return 0;
>+ }
>+ }
>+
>+ ha = kmalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>+ if (!ha)
>+ return -ENOMEM;
>+ memcpy(ha->addr, addr, addr_len);
>+ ha->refcount = 1;
>+ list_add_tail_rcu(&ha->list, list);
>+ return 0;
>+}
>+
>+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
>+ int addr_len)
>+{
>+ return __hw_addr_add_ii(list, addr, addr_len, -1);
>+}
>+
>+static void ha_rcu_free(struct rcu_head *head)
>+{
>+ struct netdev_hw_addr *ha;
>+
>+ ha = container_of(head, struct netdev_hw_addr, rcu_head);
>+ kfree(ha);
>+}
>+
>+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
>+ int addr_len, int ignore_index)
>+{
>+ struct netdev_hw_addr *ha;
>+ int i = 0;
>+
>+ list_for_each_entry(ha, list, list) {
>+ if (i++ != ignore_index &&
>+ !memcmp(ha->addr, addr, addr_len)) {
>+ if (--ha->refcount)
>+ return 0;
>+ list_del_rcu(&ha->list);
>+ call_rcu(&ha->rcu_head, ha_rcu_free);
>+ return 0;
>+ }
>+ }
>+ return -ENOENT;
>+}
>+
>+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
>+ int addr_len)
>+{
>+ return __hw_addr_del_ii(list, addr, addr_len, -1);
>+}
>+
>+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
>+ struct list_head *from_list,
>+ int addr_len, int ignore_index)
>+{
>+ int err;
>+ struct netdev_hw_addr *ha, *ha2;
>+
>+ list_for_each_entry(ha, from_list, list) {
>+ err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
>+ if (err)
>+ goto unroll;
>+ }
>+ return 0;
>+
>+unroll:
>+ list_for_each_entry(ha2, from_list, list) {
>+ if (ha2 == ha)
>+ break;
>+ __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
>+ }
>+ return err;
>+}
>+
>+static int __hw_addr_add_multiple(struct list_head *to_list,
>+ struct list_head *from_list,
>+ int addr_len)
>+{
>+ return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
>+}
>+
>+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
>+ struct list_head *from_list,
>+ int addr_len, int ignore_index)
>+{
>+ struct netdev_hw_addr *ha;
>+
>+ list_for_each_entry(ha, from_list, list) {
>+ __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
>+ }
>+}
>+
>+static void __hw_addr_del_multiple(struct list_head *to_list,
>+ struct list_head *from_list,
>+ int addr_len)
>+{
>+ __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
>+}
>+
>+static void __hw_addr_flush(struct list_head *list)
>+{
>+ struct netdev_hw_addr *ha, *tmp;
>+
>+ list_for_each_entry_safe(ha, tmp, list, list) {
>+ list_del_rcu(&ha->list);
>+ call_rcu(&ha->rcu_head, ha_rcu_free);
>+ }
>+}
>+
>+/* Device addresses handling functions */
>+
>+static void dev_addr_flush(struct net_device *dev)
>+{
>+ /* rtnl_mutex must be held here */
>+
>+ __hw_addr_flush(&dev->dev_addr_list);
>+ dev->dev_addr = NULL;
>+}
>+
>+static int dev_addr_init(struct net_device *dev)
>+{
>+ unsigned char addr[MAX_ADDR_LEN];
>+ struct netdev_hw_addr *ha;
>+ int err;
>+
>+ /* rtnl_mutex must be held here */
>+
>+ INIT_LIST_HEAD(&dev->dev_addr_list);
>+ memset(addr, 0, sizeof(*addr));
>+ err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>+ if (!err) {
>+ /*
>+ * Get the first (previously created) address from the list
>+ * and set dev_addr pointer to this location.
>+ */
>+ ha = list_first_entry(&dev->dev_addr_list,
>+ struct netdev_hw_addr, list);
>+ dev->dev_addr = ha->addr;
>+ }
>+ return err;
>+}
>+
>+/**
>+ * dev_addr_add - Add a device address
>+ * @dev: device
>+ * @addr: address to add
>+ *
>+ * Add a device address to the device or increase the reference count if
>+ * it already exists.
>+ *
>+ * The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_add(struct net_device *dev, unsigned char *addr)
>+{
>+ int err;
>+
>+ ASSERT_RTNL();
>+
>+ err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>+ if (!err)
>+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+ return err;
>+}
>+EXPORT_SYMBOL(dev_addr_add);
>+
>+/**
>+ * dev_addr_del - Release a device address.
>+ * @dev: device
>+ * @addr: address to delete
>+ *
>+ * Release reference to a device address and remove it from the device
>+ * if the reference count drops to zero.
>+ *
>+ * The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_del(struct net_device *dev, unsigned char *addr)
>+{
>+ int err;
>+
>+ ASSERT_RTNL();
>+
>+ err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>+ if (!err)
>+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>+ return err;
>+}
>+EXPORT_SYMBOL(dev_addr_del);
>+
>+/**
>+ * dev_addr_add_multiple - Add device addresses from another device
>+ * @to_dev: device to which addresses will be added
>+ * @from_dev: device from which addresses will be added
>+ *
>+ * Add device addresses of the one device to another.
>+ *
>+ * The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_add_multiple(struct net_device *to_dev,
>+ struct net_device *from_dev)
>+{
>+ int err;
>+
>+ ASSERT_RTNL();
>+
>+ if (from_dev->addr_len != to_dev->addr_len)
>+ return -EINVAL;
>+ err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>+ &from_dev->dev_addr_list,
>+ to_dev->addr_len, 0);
>+ if (!err)
>+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>+ return err;
>+}
>+EXPORT_SYMBOL(dev_addr_add_multiple);
>+
>+/**
>+ * dev_addr_del_multiple - Delete device addresses by another device
>+ * @to_dev: device where the addresses will be deleted
>+ * @from_dev: device by which addresses the addresses will be deleted
>+ *
>+ * Deletes addresses in to device by the list of addresses in from device.
>+ *
>+ * The caller must hold the rtnl_mutex.
>+ */
>+int dev_addr_del_multiple(struct net_device *to_dev,
>+ struct net_device *from_dev)
>+{
>+ ASSERT_RTNL();
>+
>+ if (from_dev->addr_len != to_dev->addr_len)
>+ return -EINVAL;
>+ __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>+ &from_dev->dev_addr_list,
>+ to_dev->addr_len, 0);
>+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>+ return 0;
>+}
>+EXPORT_SYMBOL(dev_addr_del_multiple);
>+
>+/* unicast and multicast addresses handling functions */
>+
> int __dev_addr_delete(struct dev_addr_list **list, int *count,
> void *addr, int alen, int glbl)
> {
>@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>
> dev->gso_max_size = GSO_MAX_SIZE;
>
>+ dev_addr_init(dev);
> netdev_init_queues(dev);
>
> INIT_LIST_HEAD(&dev->napi_list);
>@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
>
> kfree(dev->_tx);
>
>+ /* Flush device addresses */
>+ dev_addr_flush(dev);
>+
> list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
> netif_napi_del(p);
>
>--
>1.6.0.6
>

2009-04-23 08:16:51

by Jiri Pirko

[permalink] [raw]
Subject: Re: [PATCH 1/3] net: introduce a list of device addresses dev_addr_list (v4)

Mon, Apr 20, 2009 at 06:11:56PM CEST, [email protected] wrote:
>How about this (and another 2 patches in patchset)? What's your opinion guys?

Eric, Stephen, Patrick? Can you please look at this? Any objections?
>
>Thanks,
>
>Jirka
>
>Sat, Apr 18, 2009 at 10:58:49AM CEST, [email protected] wrote:
>>v3 -> v4 (current):
>>-changed kzalloc to kmalloc in __hw_addr_add_ii()
>>-ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init()
>>
>>v2 -> v3:
>>-removed unnecessary rcu read locking
>>-moved dev_addr_flush() calling to ensure no null dereference of dev_addr
>>
>>v1 -> v2:
>>-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
>>-removed unnecessary rcu_read locking in dev_addr_init
>>-use compare_ether_addr_64bits instead of compare_ether_addr
>>-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
>>-use call_rcu instead of rcu_synchronize
>>-moved is_etherdev_addr into __KERNEL__ ifdef
>>
>>This patch introduces a new list in struct net_device and brings a set of
>>functions to handle the work with device address list. The list is a replacement
>>for the original dev_addr field and because in some situations there is need to
>>carry several device addresses with the net device. To be backward compatible,
>>dev_addr is made to point to the first member of the list so original drivers
>>sees no difference.
>>
>>Signed-off-by: Jiri Pirko <[email protected]>
>>---
>> include/linux/etherdevice.h | 27 +++++
>> include/linux/netdevice.h | 32 +++++-
>> net/core/dev.c | 261 +++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 318 insertions(+), 2 deletions(-)
>>
>>diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
>>index a1f17ab..3d7a668 100644
>>--- a/include/linux/etherdevice.h
>>+++ b/include/linux/etherdevice.h
>>@@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
>> return compare_ether_addr(addr1, addr2);
>> #endif
>> }
>>+
>>+/**
>>+ * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
>>+ * @dev: Pointer to a device structure
>>+ * @addr: Pointer to a six-byte array containing the Ethernet address
>>+ *
>>+ * Compare passed address with all addresses of the device. Return true if the
>>+ * address if one of the device addresses.
>>+ *
>>+ * Note that this function calls compare_ether_addr_64bits() so take care of
>>+ * the right padding.
>>+ */
>>+static inline bool is_etherdev_addr(const struct net_device *dev,
>>+ const u8 addr[6 + 2])
>>+{
>>+ struct netdev_hw_addr *ha;
>>+ int res = 1;
>>+
>>+ rcu_read_lock();
>>+ for_each_dev_addr(dev, ha) {
>>+ res = compare_ether_addr_64bits(addr, ha->addr);
>>+ if (!res)
>>+ break;
>>+ }
>>+ rcu_read_unlock();
>>+ return !res;
>>+}
>> #endif /* __KERNEL__ */
>>
>> /**
>>diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>index 2e7783f..89ad6d2 100644
>>--- a/include/linux/netdevice.h
>>+++ b/include/linux/netdevice.h
>>@@ -210,6 +210,13 @@ struct dev_addr_list
>> #define dmi_users da_users
>> #define dmi_gusers da_gusers
>>
>>+struct netdev_hw_addr {
>>+ struct list_head list;
>>+ unsigned char addr[MAX_ADDR_LEN];
>>+ int refcount;
>>+ struct rcu_head rcu_head;
>>+};
>>+
>> struct hh_cache
>> {
>> struct hh_cache *hh_next; /* Next entry */
>>@@ -776,8 +783,11 @@ struct net_device
>> */
>> unsigned long last_rx; /* Time of last Rx */
>> /* Interface address info used in eth_type_trans() */
>>- unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
>>- because most packets are unicast) */
>>+ unsigned char *dev_addr; /* hw address, (before bcast
>>+ because most packets are
>>+ unicast) */
>>+
>>+ struct list_head dev_addr_list; /* list of device hw addresses */
>>
>> unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
>>
>>@@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>> spin_unlock_bh(&dev->addr_list_lock);
>> }
>>
>>+/*
>>+ * dev_addr_list walker. Should be used only for read access. Call with
>>+ * rcu_read_lock held.
>>+ */
>>+#define for_each_dev_addr(dev, ha) \
>>+ list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
>>+
>> /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>>
>> extern void ether_setup(struct net_device *dev);
>>@@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>> alloc_netdev_mq(sizeof_priv, name, setup, 1)
>> extern int register_netdev(struct net_device *dev);
>> extern void unregister_netdev(struct net_device *dev);
>>+
>>+/* Functions used for device addresses handling */
>>+extern int dev_addr_add(struct net_device *dev,
>>+ unsigned char *addr);
>>+extern int dev_addr_del(struct net_device *dev,
>>+ unsigned char *addr);
>>+extern int dev_addr_add_multiple(struct net_device *to_dev,
>>+ struct net_device *from_dev);
>>+extern int dev_addr_del_multiple(struct net_device *to_dev,
>>+ struct net_device *from_dev);
>>+
>> /* Functions used for secondary unicast and multicast support */
>> extern void dev_set_rx_mode(struct net_device *dev);
>> extern void __dev_set_rx_mode(struct net_device *dev);
>>diff --git a/net/core/dev.c b/net/core/dev.c
>>index 343883f..2274294 100644
>>--- a/net/core/dev.c
>>+++ b/net/core/dev.c
>>@@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
>> netif_addr_unlock_bh(dev);
>> }
>>
>>+/* hw addresses list handling functions */
>>+
>>+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
>>+ int addr_len, int ignore_index)
>>+{
>>+ struct netdev_hw_addr *ha;
>>+ int i = 0;
>>+
>>+ if (addr_len > MAX_ADDR_LEN)
>>+ return -EINVAL;
>>+
>>+ list_for_each_entry(ha, list, list) {
>>+ if (i++ != ignore_index &&
>>+ !memcmp(ha->addr, addr, addr_len)) {
>>+ ha->refcount++;
>>+ return 0;
>>+ }
>>+ }
>>+
>>+ ha = kmalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>>+ if (!ha)
>>+ return -ENOMEM;
>>+ memcpy(ha->addr, addr, addr_len);
>>+ ha->refcount = 1;
>>+ list_add_tail_rcu(&ha->list, list);
>>+ return 0;
>>+}
>>+
>>+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
>>+ int addr_len)
>>+{
>>+ return __hw_addr_add_ii(list, addr, addr_len, -1);
>>+}
>>+
>>+static void ha_rcu_free(struct rcu_head *head)
>>+{
>>+ struct netdev_hw_addr *ha;
>>+
>>+ ha = container_of(head, struct netdev_hw_addr, rcu_head);
>>+ kfree(ha);
>>+}
>>+
>>+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
>>+ int addr_len, int ignore_index)
>>+{
>>+ struct netdev_hw_addr *ha;
>>+ int i = 0;
>>+
>>+ list_for_each_entry(ha, list, list) {
>>+ if (i++ != ignore_index &&
>>+ !memcmp(ha->addr, addr, addr_len)) {
>>+ if (--ha->refcount)
>>+ return 0;
>>+ list_del_rcu(&ha->list);
>>+ call_rcu(&ha->rcu_head, ha_rcu_free);
>>+ return 0;
>>+ }
>>+ }
>>+ return -ENOENT;
>>+}
>>+
>>+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
>>+ int addr_len)
>>+{
>>+ return __hw_addr_del_ii(list, addr, addr_len, -1);
>>+}
>>+
>>+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
>>+ struct list_head *from_list,
>>+ int addr_len, int ignore_index)
>>+{
>>+ int err;
>>+ struct netdev_hw_addr *ha, *ha2;
>>+
>>+ list_for_each_entry(ha, from_list, list) {
>>+ err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
>>+ if (err)
>>+ goto unroll;
>>+ }
>>+ return 0;
>>+
>>+unroll:
>>+ list_for_each_entry(ha2, from_list, list) {
>>+ if (ha2 == ha)
>>+ break;
>>+ __hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
>>+ }
>>+ return err;
>>+}
>>+
>>+static int __hw_addr_add_multiple(struct list_head *to_list,
>>+ struct list_head *from_list,
>>+ int addr_len)
>>+{
>>+ return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
>>+}
>>+
>>+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
>>+ struct list_head *from_list,
>>+ int addr_len, int ignore_index)
>>+{
>>+ struct netdev_hw_addr *ha;
>>+
>>+ list_for_each_entry(ha, from_list, list) {
>>+ __hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
>>+ }
>>+}
>>+
>>+static void __hw_addr_del_multiple(struct list_head *to_list,
>>+ struct list_head *from_list,
>>+ int addr_len)
>>+{
>>+ __hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
>>+}
>>+
>>+static void __hw_addr_flush(struct list_head *list)
>>+{
>>+ struct netdev_hw_addr *ha, *tmp;
>>+
>>+ list_for_each_entry_safe(ha, tmp, list, list) {
>>+ list_del_rcu(&ha->list);
>>+ call_rcu(&ha->rcu_head, ha_rcu_free);
>>+ }
>>+}
>>+
>>+/* Device addresses handling functions */
>>+
>>+static void dev_addr_flush(struct net_device *dev)
>>+{
>>+ /* rtnl_mutex must be held here */
>>+
>>+ __hw_addr_flush(&dev->dev_addr_list);
>>+ dev->dev_addr = NULL;
>>+}
>>+
>>+static int dev_addr_init(struct net_device *dev)
>>+{
>>+ unsigned char addr[MAX_ADDR_LEN];
>>+ struct netdev_hw_addr *ha;
>>+ int err;
>>+
>>+ /* rtnl_mutex must be held here */
>>+
>>+ INIT_LIST_HEAD(&dev->dev_addr_list);
>>+ memset(addr, 0, sizeof(*addr));
>>+ err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>>+ if (!err) {
>>+ /*
>>+ * Get the first (previously created) address from the list
>>+ * and set dev_addr pointer to this location.
>>+ */
>>+ ha = list_first_entry(&dev->dev_addr_list,
>>+ struct netdev_hw_addr, list);
>>+ dev->dev_addr = ha->addr;
>>+ }
>>+ return err;
>>+}
>>+
>>+/**
>>+ * dev_addr_add - Add a device address
>>+ * @dev: device
>>+ * @addr: address to add
>>+ *
>>+ * Add a device address to the device or increase the reference count if
>>+ * it already exists.
>>+ *
>>+ * The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_add(struct net_device *dev, unsigned char *addr)
>>+{
>>+ int err;
>>+
>>+ ASSERT_RTNL();
>>+
>>+ err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>>+ if (!err)
>>+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>>+ return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_add);
>>+
>>+/**
>>+ * dev_addr_del - Release a device address.
>>+ * @dev: device
>>+ * @addr: address to delete
>>+ *
>>+ * Release reference to a device address and remove it from the device
>>+ * if the reference count drops to zero.
>>+ *
>>+ * The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_del(struct net_device *dev, unsigned char *addr)
>>+{
>>+ int err;
>>+
>>+ ASSERT_RTNL();
>>+
>>+ err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
>>+ if (!err)
>>+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
>>+ return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_del);
>>+
>>+/**
>>+ * dev_addr_add_multiple - Add device addresses from another device
>>+ * @to_dev: device to which addresses will be added
>>+ * @from_dev: device from which addresses will be added
>>+ *
>>+ * Add device addresses of the one device to another.
>>+ *
>>+ * The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_add_multiple(struct net_device *to_dev,
>>+ struct net_device *from_dev)
>>+{
>>+ int err;
>>+
>>+ ASSERT_RTNL();
>>+
>>+ if (from_dev->addr_len != to_dev->addr_len)
>>+ return -EINVAL;
>>+ err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>>+ &from_dev->dev_addr_list,
>>+ to_dev->addr_len, 0);
>>+ if (!err)
>>+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>>+ return err;
>>+}
>>+EXPORT_SYMBOL(dev_addr_add_multiple);
>>+
>>+/**
>>+ * dev_addr_del_multiple - Delete device addresses by another device
>>+ * @to_dev: device where the addresses will be deleted
>>+ * @from_dev: device by which addresses the addresses will be deleted
>>+ *
>>+ * Deletes addresses in to device by the list of addresses in from device.
>>+ *
>>+ * The caller must hold the rtnl_mutex.
>>+ */
>>+int dev_addr_del_multiple(struct net_device *to_dev,
>>+ struct net_device *from_dev)
>>+{
>>+ ASSERT_RTNL();
>>+
>>+ if (from_dev->addr_len != to_dev->addr_len)
>>+ return -EINVAL;
>>+ __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
>>+ &from_dev->dev_addr_list,
>>+ to_dev->addr_len, 0);
>>+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
>>+ return 0;
>>+}
>>+EXPORT_SYMBOL(dev_addr_del_multiple);
>>+
>>+/* unicast and multicast addresses handling functions */
>>+
>> int __dev_addr_delete(struct dev_addr_list **list, int *count,
>> void *addr, int alen, int glbl)
>> {
>>@@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>>
>> dev->gso_max_size = GSO_MAX_SIZE;
>>
>>+ dev_addr_init(dev);
>> netdev_init_queues(dev);
>>
>> INIT_LIST_HEAD(&dev->napi_list);
>>@@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
>>
>> kfree(dev->_tx);
>>
>>+ /* Flush device addresses */
>>+ dev_addr_flush(dev);
>>+
>> list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>> netif_napi_del(p);
>>
>>--
>>1.6.0.6
>>