Linux netdev master LAG聚合与slave优先级切换
Linux netdev master LAG聚合与slave优先级切换

Linux bonding/LAG（Link Aggregation）驱动位于drivers/net/bonding/，通过bonding master设备聚合多个slave网卡，提供负载均衡和链路冗余。核心数据模型基于netdev_master_upper_dev_link建立上层链路关系。bonding初始化时创建一个master net_device，每个实际网卡作为slave加入。bond_enslave函数将slave绑定到master：

```c
int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
		 struct netlink_ext_ack *extack)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave;
	int res;

	slave = kzalloc(sizeof(struct slave), GFP_KERNEL);
	if (!slave)
		return -ENOMEM;

	slave->dev = slave_dev;
	slave->bond = bond;
	dev_hold(slave_dev);

	netdev_master_upper_dev_link(slave_dev, bond_dev, slave, NULL, NULL);

	if (bond->params.mode == BOND_MODE_8023AD) {
		slave->ad_info.actor_system = bond->params.ad_actor_system;
		slave->ad_info.aggregator_id = 0;
	}

	res = bond_set_slave_link_state(slave, BOND_LINK_UP,
					BR_STATE_NONE, NULL);
	if (res)
		goto err_unlink;

	if (bond->params.mode == BOND_MODE_ACTIVEBACKUP) {
		if (!bond->curr_active_slave) {
			bond_set_active_slave(slave);
			bond->curr_active_slave = slave;
		} else {
			bond_set_backup_slave(slave);
		}
	}

	slave->original_mtu = slave_dev->mtu;
	slave->speed = slave_dev->ethtool_ops ?
		slave_dev->ethtool_ops->get_link_ksettings ?
		slave_dev->ethtool_ops->get_link_ksettings(...)
		: SPEED_UNKNOWN : SPEED_UNKNOWN;

	if (bond->params.mode == BOND_MODE_ACTIVEBACKUP ||
	    bond->params.mode == BOND_MODE_ROUNDROBIN ||
	    bond->params.mode == BOND_MODE_BALANCE_RR)
		bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW);

	bond_update_slave_arr(bond, NULL);
	return 0;

err_unlink:
	netdev_upper_dev_unlink(slave_dev, bond_dev);
	dev_put(slave_dev);
	kfree(slave);
	return res;
}
```

主备模式（active-backup）下，slave优先级切换由bond_select_active_slave决定：

```c
struct slave *bond_select_active_slave(struct bonding *bond)
{
	struct slave *best_slave = NULL;
	struct list_head *iter;
	struct slave *slave;
	int best_priority = -1;

	bond_for_each_slave(bond, slave, iter) {
		if (!bond_slave_can_tx(slave))
			continue;
		if (slave->link == BOND_LINK_DOWN)
			continue;
		if (!bond_has_lii(bond, slave))
			continue;
		if (slave->priority > best_priority) {
			best_priority = slave->priority;
			best_slave = slave;
		}
	}

	if (best_slave)
		bond_set_active_slave(best_slave);
	return best_slave;
}
```

slave优先级通过netlink属性IFLA_BOND_SLAVE_PRIORITY设置，默认值为0。优先级高的slave优先成为active。当active slave的link down时，bonding在bond_mii_monitor中检测到状态变化，触发failover：

```c
static void bond_mii_monitor(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    mii_work.work);
	struct slave *slave, *oldactive;
	bool should_notify_peers = false;
	bool commit = false;

	read_lock(&bond->lock);
	oldactive = bond->curr_active_slave;

	bond_for_each_slave(bond, slave, iter) {
		int link_state = bond_check_dev_link(bond, slave->dev, 0);

		if (link_state != slave->link) {
			slave->link = link_state;
			if (link_state == BOND_LINK_DOWN) {
				if (slave == bond->curr_active_slave) {
					commit = true;
				}
			}
		}
	}

	if (commit && bond->curr_active_slave &&
	    bond->curr_active_slave->link == BOND_LINK_DOWN) {
		bond_set_slave_inactive_flags(bond->curr_active_slave,
					      BOND_SLAVE_NOTIFY_NOW);
		bond_select_active_slave(bond);
	}
	read_unlock(&bond->lock);

	if (bond->curr_active_slave != oldactive) {
		netdev_cmd_send_sw(bond->dev, NETDEV_BONDING_FAILOVER,
				   NULL, NULL);
		should_notify_peers = true;
	}

	if (should_notify_peers)
		netdev_notify_peers(bond->dev);

	queue_delayed_work(bond->wq, &bond->mii_work, msecs_to_jiffies(100));
}
```

发送路径上，bond_start_xmit根据模式选择slave发送。802.3ad模式下使用哈希选择slave：

```c
static netdev_tx_t bond_start_xmit(struct sk_buff *skb,
				   struct net_device *dev)
{
	struct bonding *bond = netdev_priv(dev);

	if (bond_is_np(bond)) {
		return bond_3ad_xor_xmit(skb, dev);
	}

	switch (bond->params.mode) {
	case BOND_MODE_ROUNDROBIN:
		return bond_xmit_roundrobin(skb, dev);
	case BOND_MODE_ACTIVEBACKUP:
		return bond_xmit_activebackup(skb, dev);
	case BOND_MODE_XOR:
		return bond_3ad_xor_xmit(skb, dev);
	case BOND_MODE_BROADCAST:
		return bond_xmit_broadcast(skb, dev);
	case BOND_MODE_8023AD:
		return bond_3ad_xor_xmit(skb, dev);
	case BOND_MODE_ALB:
		return bond_alb_xmit(skb, dev);
	case BOND_MODE_TLB:
		return bond_tlb_xmit(skb, dev);
	default:
		return dev_queue_xmit(skb);
	}
}
```

active-backup模式下，bond_xmit_activebackup的实现在当前active slave上发送：

```c
static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
					  struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave;

	slave = bond->curr_active_slave;
	if (slave) {
		if (bond_slave_can_tx(slave))
			return bond_dev_queue_xmit(bond, skb, slave->dev);
	}

	bond->dev->stats.tx_dropped++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
```

slave数组的维护通过bond_update_slave_arr完成。每次slave状态变化时重新构建：

```c
void bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
{
	struct slave *slave;
	struct list_head *iter;
	struct bond_up_slave *new_arr;
	int slaves_in_agg;

	if (bond->params.mode == BOND_MODE_8023AD)
		slaves_in_agg = bond_3ad_get_active_agg_slave_num(bond);
	else
		slaves_in_agg = bond->slave_cnt;

	new_arr = kmalloc(struct_size(new_arr, arr, slaves_in_agg), GFP_ATOMIC);
	if (!new_arr)
		return;

	new_arr->count = 0;
	bond_for_each_slave(bond, slave, iter) {
		if (bond_slave_can_tx(slave))
			new_arr->arr[new_arr->count++] = slave;
	}

	rcu_assign_pointer(bond->slave_arr, new_arr);
	synchronize_rcu();
	// old array will be freed by RCU callback
}
```

LACP（802.3ad）模式下，slave优先级通过actor_port_priority和partner的system priority联合决定slave在aggregator中的激活顺序。bond_3ad_state_machine_handler周期性处理LACPDU协议报文，更新聚合组状态，动态调整slave激活集合。