程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> 網頁編程 >> PHP編程 >> 關於PHP編程 >> Linux 3.10 kernel bridge轉發邏輯

Linux 3.10 kernel bridge轉發邏輯

編輯:關於PHP編程

Linux 3.10 kernel bridge轉發邏輯


Linux 3.10 kernel bridge轉發邏輯

——lvyilong316

之前分析過linux kernel 2.6.32的bridge轉發邏輯,下面分析一下linux kernel 3.10的bridge轉發邏輯。這兩個版本正是CentOS 6和CentOS 7對應的內核。3.10 kernel中bridge邏輯的最大改變就是增加了vlan處理邏輯,以及bridge入口函數的設置方式。

1. netdev_rx_handler_register

在分析之前首先要介紹一個重要函數:netdev_rx_handler_register,這個函數是2.6內核所沒有的。

• netdev_rx_handler_register


  1. /*
  2. * dev: 要注冊接收函數的dev
  3. * rx_handler: 要注冊的接收函數
  4. * rx_handler_data: 指向rx_handler使用的數據
  5. */
  6. int netdev_rx_handler_register(struct net_device *dev,
  7. rx_handler_func_t *rx_handler,
  8. void *rx_handler_data)
  9. {
  10. ASSERT_RTNL();

  11. if (dev->rx_handler)
  12. return -EBUSY;

  13. /* Note: rx_handler_data must be set before rx_handler */
  14. rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
  15. rcu_assign_pointer(dev->rx_handler, rx_handler);

  16. return 0;
  17. }


這個函數可以給設備(net_device)注冊接收函數,然後在__netif_receive_skb函數中根據接收skb的設備接口,再調用這個被注冊的接收函數。比如為網橋下的接口注冊br_handle_frame函數,為bonding接口注冊bond_handle_frame函數。這相對於老式的網橋處理更靈活,有了這個機制也可以在模塊中自行注冊處理函數。比如3.10中的openvswitch(OpenvSwitch在3.10已經合入了內核)創建netdev vport的函數netdev_create。

• netdev_create


  1. static struct vport *netdev_create(const struct vport_parms *parms)
  2. {
  3. struct vport *vport;
  4. /....../
  5. err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,vport);
  6. /....../
  7. }


這個函數在創建netdev vport時將設備的接收函數設置為netdev_frame_hook函數,這也是整個Open vSwitch的入口函數。如果查看Open vSwitch的源碼,可以看到當它安裝於2.6內核時,這裡是替換掉bridge的br_handle_frame_hook函數,從而由bridge邏輯進入Open vSwitch邏輯。

2. Bridge轉發邏輯分析

還是先從netif_receive_skb函數分析,這個函數算是進入協議棧的入口。

• netif_receive_skb


  1. int netif_receive_skb(struct sk_buff *skb)
  2. {
  3. int ret;
  4. if (skb_defer_rx_timestamp(skb))
  5. return NET_RX_SUCCESS;
  6. rcu_read_lock();
  7. /*RPS邏輯處理,現在內核中使用了RPS機制, 將報文分散到各個cpu的接收隊列中進行負載均衡處理*/
  8. #ifdef CONFIG_RPS
  9. if (static_key_false(&rps_needed)) {
  10. struct rps_dev_flow voidflow, *rflow = &voidflow;
  11. int cpu = get_rps_cpu(skb->dev, skb, &rflow);
  12. if (cpu >= 0) {
  13. ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
  14. rcu_read_unlock();
  15. return ret;
  16. }
  17. }
  18. #endif
  19. ret = __netif_receive_skb(skb);
  20. rcu_read_unlock();
  21. return ret;
  22. }


netif_receive_skb只是對數據包進行了RPS的處理,然後調用__netif_receive_skb。

__netif_receive_skb並沒有其他多餘的處理邏輯,主要調用__netif_receive_skb_core,這個函數才真正相當於2.6內核的netif_receive_skb。以下代碼省略了和bridge無關的邏輯(例如another_round標籤以及null_or_dev的賦值都不在列出的代碼中)。

• __netif_receive_skb_core


  1. static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
  2. {
  3. struct packet_type *ptype, *pt_prev;
  4. rx_handler_func_t *rx_handler;
  5. struct net_device *orig_dev;
  6. struct net_device *null_or_dev;
  7. bool deliver_exact = false;
  8. int ret = NET_RX_DROP;
  9. __be16 type;
  10. /*......*/
  11. orig_dev = skb->dev;
  12. skb_reset_network_header(skb);
  13. pt_prev = NULL;
  14. skb->skb_iif = skb->dev->ifindex;
  15. /*ptype_all協議處理,tcpdump抓包就在這裡*/
  16. list_for_each_entry_rcu(ptype, &ptype_all, list) {
  17. if (!ptype->dev || ptype->dev == skb->dev) {
  18. if (pt_prev)
  19. ret = deliver_skb(skb, pt_prev, orig_dev);
  20. pt_prev = ptype;
  21. }
  22. }
  23. /*調用接收設備的rx_handler*/
  24. rx_handler = rcu_dereference(skb->dev->rx_handler);
  25. if (rx_handler) {
  26. if (pt_prev) {
  27. ret = deliver_skb(skb, pt_prev, orig_dev);
  28. pt_prev = NULL;
  29. }
  30. switch (rx_handler(&skb)) {
  31. case RX_HANDLER_CONSUMED:
  32. ret = NET_RX_SUCCESS;
  33. goto out;
  34. case RX_HANDLER_ANOTHER:
  35. goto another_round;
  36. case RX_HANDLER_EXACT:
  37. deliver_exact = true;
  38. case RX_HANDLER_PASS:
  39. break;
  40. default:
  41. BUG();
  42. }
  43. }
  44. /*根據 skb->protocol傳遞給上層協議*/
  45. type = skb->protocol;
  46. list_for_each_entry_rcu(ptype,&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
  47. if (ptype->type == type &&(ptype->dev == null_or_dev || ptype->dev == skb->dev ||ptype->dev == orig_dev)) {
  48. if (pt_prev)
  49. ret = deliver_skb(skb, pt_prev, orig_dev);
  50. pt_prev = ptype;
  51. }
  52. }
  53. if (pt_prev) {
  54. if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
  55. goto drop;
  56. else
  57. ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
  58. } else {
  59. drop:
  60. atomic_long_inc(&skb->dev->rx_dropped);
  61. kfree_skb(skb);
  62. ret = NET_RX_DROP;
  63. }
  64. out:
  65. return ret;
  66. }


如果一個dev被添加到一個bridge(作為bridge的一個接口),那麼這個接口設備的rx_handler會被設置為br_handle_frame函數。這是在br_add_if函數中設置的,而br_add_if(net/bridge/br_if.c)是在向網橋設備添加接口時調用的。進入br_handle_frame也就進入了bridge的邏輯代碼。

• br_add_if


  1. int br_add_if(struct net_bridge *br, struct net_device *dev)
  2. {
  3. /*......*/
  4. err = netdev_rx_handler_register(dev, br_handle_frame, p);
  5. /*......*/
  6. }


• br_handle_frame


  1. rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
  2. {
  3. struct net_bridge_port *p;
  4. struct sk_buff *skb = *pskb;
  5. const unsigned char *dest = eth_hdr(skb)->h_dest;
  6. br_should_route_hook_t *rhook;
  7. if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
  8. return RX_HANDLER_PASS;
  9. if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
  10. goto drop;
  11. skb = skb_share_check(skb, GFP_ATOMIC);
  12. if (!skb)
  13. return RX_HANDLER_CONSUMED;
  14. /*獲取dev對應的bridge port*/
  15. p = br_port_get_rcu(skb->dev);
  16. /*特殊目的mac地址的處理*/
  17. if (unlikely(is_link_local_ether_addr(dest))) {
  18. /*
  19. * See IEEE 802.1D Table 7-10 Reserved addresses
  20. *
  21. * Assignment Value
  22. * Bridge Group Address 01-80-C2-00-00-00
  23. * (MAC Control) 802.3 01-80-C2-00-00-01
  24. * (Link Aggregation) 802.3 01-80-C2-00-00-02
  25. * 802.1X PAE address 01-80-C2-00-00-03
  26. *
  27. * 802.1AB LLDP 01-80-C2-00-00-0E
  28. *
  29. * Others reserved for future standardization
  30. */
  31. switch (dest[5]) {
  32. case 0x00: /* Bridge Group Address */
  33. /* If STP is turned off,then must forward to keep loop detection */
  34. if (p->br->stp_enabled == BR_NO_STP)
  35. goto forward;
  36. break;
  37. case 0x01: /* IEEE MAC (Pause) */
  38. goto drop;
  39. default:
  40. /* Allow selective forwarding for most other protocols */
  41. if (p->br->group_fwd_mask & (1u << dest[5]))
  42. goto forward;
  43. }
  44. /* LOCAL_IN hook點,注意經過這個hook點並不代表發送到主機協議棧(只有特殊目的mac01-80-C2才會走到這裡)*/
  45. if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
  46. NULL, br_handle_local_finish)) {
  47. return RX_HANDLER_CONSUMED; /* consumed by filter */
  48. } else {
  49. *pskb = skb;
  50. return RX_HANDLER_PASS; /* continue processing */
  51. }
  52. }
  53. /*轉發邏輯*/
  54. forward:
  55. switch (p->state) {
  56. case BR_STATE_FORWARDING:
  57. rhook = rcu_dereference(br_should_route_hook);
  58. if (rhook) {
  59. if ((*rhook)(skb)) {
  60. *pskb = skb;
  61. return RX_HANDLER_PASS;
  62. }
  63. dest = eth_hdr(skb)->h_dest;
  64. }
  65. /* fall through */
  66. case BR_STATE_LEARNING:
  67. /*skb的目的mac和bridge的mac一樣,則將skb發往本機協議棧*/
  68. if (ether_addr_equal(p->br->dev->dev_addr, dest))
  69. skb->pkt_type = PACKET_HOST;
  70. /*NF_BR_PRE_ROUTING hook點*/
  71. NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,br_handle_frame_finish);
  72. break;
  73. default:
  74. drop:
  75. kfree_skb(skb);
  76. }
  77. return RX_HANDLER_CONSUMED;
  78. }


經過NF_BR_LOCAL_IN hook點會執行br_handle_local_finish函數。

• br_handle_local_finish


  1. static int br_handle_local_finish(struct sk_buff *skb)
  2. {
  3. struct net_bridge_port *p = br_port_get_rcu(skb->dev);
  4. u16 vid = 0;
  5. /*獲取skb的vlan id(3.10的bridge支持vlan)*/
  6. br_vlan_get_tag(skb, &vid);
  7. /*更新bridge的mac表,注意vlan id也是參數,說明每個vlan有一個獨立的mac表*/
  8. br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid);
  9. return 0; /* process further */
  10. }


經過NF_BR_PRE_ROUTING hook點會執行br_handle_frame_finish函數。

• br_handle_frame_finish


  1. int br_handle_frame_finish(struct sk_buff *skb)
  2. {
  3. const unsigned char *dest = eth_hdr(skb)->h_dest;
  4. struct net_bridge_port *p = br_port_get_rcu(skb->dev);
  5. struct net_bridge *br;
  6. struct net_bridge_fdb_entry *dst;
  7. struct net_bridge_mdb_entry *mdst;
  8. struct sk_buff *skb2;
  9. u16 vid = 0;
  10. if (!p || p->state == BR_STATE_DISABLED)
  11. goto drop;
  12. /*這個判斷主要是vlan的相關檢查,如是否和接收接口配置的vlan相同*/
  13. if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid))
  14. goto out;
  15. /* insert into forwarding database after filtering to avoid spoofing */
  16. br = p->br;
  17. /*更新轉發數據庫*/
  18. br_fdb_update(br, p, eth_hdr(skb)->h_source, vid);
  19. /*多播mac的處理*/
  20. if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
  21. br_multicast_rcv(br, p, skb))
  22. goto drop;
  23. if (p->state == BR_STATE_LEARNING)
  24. goto drop;
  25. BR_INPUT_SKB_CB(skb)->brdev = br->dev;
  26. /* The packet skb2 goes to the local host (NULL to skip). */
  27. skb2 = NULL;
  28. /*如果網橋被設置為混雜模式*/
  29. if (br->dev->flags & IFF_PROMISC)
  30. skb2 = skb;
  31. dst = NULL;
  32. /*如果skb的目的mac是廣播*/
  33. if (is_broadcast_ether_addr(dest))
  34. skb2 = skb;
  35. else if (is_multicast_ether_addr(dest)) { /*多播*/
  36. mdst = br_mdb_get(br, skb, vid);
  37. if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
  38. if ((mdst && mdst->mglist) ||
  39. br_multicast_is_router(br))
  40. skb2 = skb;
  41. br_multicast_forward(mdst, skb, skb2);
  42. skb = NULL;
  43. if (!skb2)
  44. goto out;
  45. } else
  46. skb2 = skb;
  47. br->dev->stats.multicast++;
  48. } else if ((dst = __br_fdb_get(br, dest, vid)) &&dst->is_local) {/*目的地址是本機mac,則發往本機協議棧*/
  49. skb2 = skb;
  50. /* Do not forward the packet since it's local. */
  51. skb = NULL;
  52. }
  53. if (skb) {
  54. if (dst) {
  55. dst->used = jiffies;
  56. br_forward(dst->dst, skb, skb2); //轉發給目的接口
  57. } else
  58. br_flood_forward(br, skb, skb2); //找不到目的接口則廣播
  59. }
  60. if (skb2)
  61. return br_pass_frame_up(skb2); //發往本機協議棧
  62. out:
  63. return 0;
  64. drop:
  65. kfree_skb(skb);
  66. goto out;
  67. }


我們先看發往本機協議棧的函數br_pass_frame_up。

• br_pass_frame_up

  1. static int br_pass_frame_up(struct sk_buff *skb)
  2. {
  3. struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
  4. struct net_bridge *br = netdev_priv(brdev);
  5. //更新統計計數(略)
  6. /* Bridge is just like any other port. Make sure the
  7. * packet is allowed except in promisc modue when someone
  8. * may be running packet capture.
  9. */
  10. if (!(brdev->flags & IFF_PROMISC) &&!br_allowed_egress(br, br_get_vlan_info(br), skb)) {
  11. kfree_skb(skb); //如果不是混雜模式且vlan處理不合要求則丟棄
  12. return NET_RX_DROP;
  13. }
  14. //vlan處理邏輯
  15. skb = br_handle_vlan(br, br_get_vlan_info(br), skb);
  16. if (!skb)
  17. return NET_RX_DROP;
  18. indev = skb->dev;
  19. skb->dev = brdev; //重點,這裡修改了skb->dev為bridge
  20. //經過NF_BR_LOCAL_IN再次進入協議棧
  21. return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL,
  22. netif_receive_skb);
  23. }

再次進入netif_receive_skb,由於skb->dev被設置成了bridge,而bridge設備的rx_handler函數是沒有被設置的,所以就不會再次進入bridge邏輯,而直接進入了主機上層協議棧。

下面看轉發邏輯,轉發邏輯主要在br_forward函數中,而br_forward主要調用__br_forward函數。

• __br_forward


  1. static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
  2. {
  3. struct net_device *indev;
  4. //vlan處理
  5. skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
  6. if (!skb)
  7. return;
  8. indev = skb->dev;
  9. skb->dev = to->dev; //skb->dev設置為出口設備dev
  10. skb_forward_csum(skb);
  11. //經過NF_BR_FORWARD hook點,調用br_forward_finish
  12. NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
  13. br_forward_finish);
  14. }

• br_forward_finish

  1. int br_forward_finish(struct sk_buff *skb)
  2. {
  3. //經過NF_BR_POST_ROUTING hook點,調用br_dev_queue_push_xmit
  4. return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,br_dev_queue_push_xmit);
  5. }

• br_dev_queue_push_xmit

  1. int br_dev_queue_push_xmit(struct sk_buff *skb)
  2. {
  3. /* ip_fragment doesn't copy the MAC header */
  4. if (nf_bridge_maybe_copy_header(skb) ||(packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))) {
  5. kfree_skb(skb);
  6. } else {
  7. skb_push(skb, ETH_HLEN);
  8. br_drop_fake_rtable(skb);
  9. dev_queue_xmit(skb); //發送到鏈路層
  10. }
  11. return 0;
  12. }

skb進入dev_queue_xmit後就會調用相應設備驅動的發送函數,也就離開了bridge邏輯。整個3.10 kernel的bridge轉發邏輯如下圖所示:

注意,和2.6 kernel一樣,bridge的OUTPUT hook點在bridge dev的發送函數中,這裡不再列出分析。

  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved