[lustre-discuss] lnet configuration messed up when clients mount lustre

Riccardo Veraldi Riccardo.Veraldi at cnaf.infn.it
Thu Apr 19 19:18:41 PDT 2018


Hello,
On my OSSes and on my clients, the lnet configuration is loaded at
boot time from lnet.conf.
I define local interfaces and peers.
What happens is that when the lustre filesystems are mounted by the
clients, lnet is modified on both the client and the OSS side: tcp peers
are added at the end of the lnet configuration, and as a consequence all
traffic starts to go through TCP and not InfiniBand.
I am using RHEL74 and Lustre 2.10.3. My configuration is a bit uncommon
because I use kernel 4.4 on the servers, while all the
clients run the stock RHEL74 kernel.

The lnet YAML configuration follows, first as it is before the clients
mount lustre and then as it is after the clients mount the lustre partitions.

It seems that automatic peer discovery is overriding ib and using just tcp.
Is there a way to stop automatic peer discovery, or a way to specify that
ib takes precedence over tcp?

lnet as configured at boot:

net:
    - net type: lo
      local NI(s):
        - nid: 0 at lo
          status: up
          statistics:
              send_count: 0
              recv_count: 0
              drop_count: 0
          tunables:
              peer_timeout: 0
              peer_credits: 0
              peer_buffer_credits: 0
              credits: 0
          lnd tunables:
          tcp bonding: 0
          dev cpt: 0
          CPT: "[0,1]"
    - net type: o2ib
      local NI(s):
        - nid: 172.21.52.84 at o2ib
          status: up
          interfaces:
              0: ib0
          statistics:
              send_count: 96252389
              recv_count: 61558248
              drop_count: 0
          tunables:
              peer_timeout: 180
              peer_credits: 128
              peer_buffer_credits: 0
              credits: 1024
          lnd tunables:
              peercredits_hiw: 64
              map_on_demand: 32
              concurrent_sends: 256
              fmr_pool_size: 2048
              fmr_flush_trigger: 512
              fmr_cache: 1
              ntx: 2048
              conns_per_peer: 4
          tcp bonding: 0
          dev cpt: 1
          CPT: "[0,1]"
        - nid: 172.21.52.116 at o2ib
          status: up
          interfaces:
              0: ib1
          statistics:
              send_count: 96253070
              recv_count: 61558217
              drop_count: 0
          tunables:
              peer_timeout: 180
              peer_credits: 128
              peer_buffer_credits: 0
              credits: 1024
          lnd tunables:
              peercredits_hiw: 64
              map_on_demand: 32
              concurrent_sends: 256
              fmr_pool_size: 2048
              fmr_flush_trigger: 512
              fmr_cache: 1
              ntx: 2048
              conns_per_peer: 4
          tcp bonding: 0
          dev cpt: 1
          CPT: "[0,1]"
    - net type: tcp
      local NI(s):
        - nid: 172.21.42.207 at tcp
          status: up
          interfaces:
              0: enp1s0f0
          statistics:
              send_count: 380697
              recv_count: 380352
              drop_count: 0
          tunables:
              peer_timeout: 180
              peer_credits: 8
              peer_buffer_credits: 0
              credits: 256
          lnd tunables:
          tcp bonding: 0
          dev cpt: 0
          CPT: "[0,1]"
peer:
    - primary nid: 172.21.42.159 at tcp
      Multi-Rail: True
      peer ni:
        - nid: 172.21.42.159 at tcp
          state: NA
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 0
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          send_count: 380697
          recv_count: 380352
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.126 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.126 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: -7
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 28134533
          recv_count: 8553649
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.127 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.127 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: 97
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 13505518
          recv_count: 6106498
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.128 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.128 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: -751
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 17672565
          recv_count: 13195155
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.129 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.129 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: -369
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 13934795
          recv_count: 11409629
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.130 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.130 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: -458
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 12257935
          recv_count: 11907534
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.131 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.131 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: -417
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 10748675
          recv_count: 10384163
          drop_count: 0
          refcount: 1

When the clients then mount the lustre partitions, lnet is modified:

net:
    - net type: lo
      local NI(s):
        - nid: 0 at lo
          status: up
          statistics:
              send_count: 0
              recv_count: 0
              drop_count: 0
          tunables:
              peer_timeout: 0
              peer_credits: 0
              peer_buffer_credits: 0
              credits: 0
          lnd tunables:
          tcp bonding: 0
          dev cpt: 0
          CPT: "[0,1]"
    - net type: o2ib
      local NI(s):
        - nid: 172.21.52.84 at o2ib
          status: up
          interfaces:
              0: ib0
          statistics:
              send_count: 0
              recv_count: 0
              drop_count: 0
          tunables:
              peer_timeout: 180
              peer_credits: 128
              peer_buffer_credits: 0
              credits: 1024
          lnd tunables:
              peercredits_hiw: 64
              map_on_demand: 32
              concurrent_sends: 256
              fmr_pool_size: 2048
              fmr_flush_trigger: 512
              fmr_cache: 1
              ntx: 2048
              conns_per_peer: 4
          tcp bonding: 0
          dev cpt: 1
          CPT: "[0,1]"
        - nid: 172.21.52.116 at o2ib
          status: up
          interfaces:
              0: ib1
          statistics:
              send_count: 0
              recv_count: 0
              drop_count: 0
          tunables:
              peer_timeout: 180
              peer_credits: 128
              peer_buffer_credits: 0
              credits: 1024
          lnd tunables:
              peercredits_hiw: 64
              map_on_demand: 32
              concurrent_sends: 256
              fmr_pool_size: 2048
              fmr_flush_trigger: 512
              fmr_cache: 1
              ntx: 2048
              conns_per_peer: 4
          tcp bonding: 0
          dev cpt: 1
          CPT: "[0,1]"
    - net type: tcp
      local NI(s):
        - nid: 172.21.42.207 at tcp
          status: up
          interfaces:
              0: enp1s0f0
          statistics:
              send_count: 646
              recv_count: 646
              drop_count: 0
          tunables:
              peer_timeout: 180
              peer_credits: 8
              peer_buffer_credits: 0
              credits: 256
          lnd tunables:
          tcp bonding: 0
          dev cpt: 0
          CPT: "[0,1]"
peer:
    - primary nid: 172.21.42.159 at tcp
      Multi-Rail: True
      peer ni:
        - nid: 172.21.42.159 at tcp
          state: NA
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 6
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          send_count: 268
          recv_count: 268
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.126 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.126 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: 128
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 0
          recv_count: 0
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.127 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.127 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: 128
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 0
          recv_count: 0
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.128 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.128 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: 128
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 0
          recv_count: 0
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.129 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.129 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: 128
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 0
          recv_count: 0
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.130 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.130 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: 128
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 0
          recv_count: 0
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.52.131 at o2ib
      Multi-Rail: True
      peer ni:
        - nid: 172.21.52.131 at o2ib
          state: NA
          max_ni_tx_credits: 128
          available_tx_credits: 128
          min_tx_credits: 128
          tx_q_num_of_buf: 0
          available_rtr_credits: 128
          min_rtr_credits: 128
          send_count: 0
          recv_count: 0
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.42.224 at tcp
      Multi-Rail: False
      peer ni:
        - nid: 172.21.42.224 at tcp
          state: NA
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          send_count: 101
          recv_count: 101
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.42.221 at tcp
      Multi-Rail: False
      peer ni:
        - nid: 172.21.42.221 at tcp
          state: NA
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          send_count: 20
          recv_count: 20
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.42.202 at tcp
      Multi-Rail: False
      peer ni:
        - nid: 172.21.42.202 at tcp
          state: NA
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          send_count: 20
          recv_count: 20
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.42.223 at tcp
      Multi-Rail: False
      peer ni:
        - nid: 172.21.42.223 at tcp
          state: NA
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          send_count: 197
          recv_count: 197
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.42.222 at tcp
      Multi-Rail: False
      peer ni:
        - nid: 172.21.42.222 at tcp
          state: NA
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          send_count: 20
          recv_count: 20
          drop_count: 0
          refcount: 1
    - primary nid: 172.21.42.201 at tcp
      Multi-Rail: False
      peer ni:
        - nid: 172.21.42.201 at tcp
          state: NA
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: 7
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: 8
          send_count: 20
          recv_count: 20
          drop_count: 0
          refcount: 1
numa:
    range: 0



More information about the lustre-discuss mailing list