<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  </head>
  <body text="#000000" bgcolor="#FFFFFF">
    <div class="moz-cite-prefix">it is not exactly this problem.</div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">here is my setup<br>
    </div>
    <div class="moz-cite-prefix">
      <ul>
        <li>MDS is on tcp0</li>
        <li>client is on tcp0 and o2ib0</li>
        <li>OSS is on tcp0 and o2ib0<br>
        </li>
      </ul>
    </div>
    <div class="moz-cite-prefix">The problem is that the MDS is
      discovering both the lustre client and the OSS as well over o2ib
      and it should not because the MDS has only one ethernet interface.
      I can see this from lnetctl peer show. This did not happen prior
      to upgrading to Lustre 2.12.0 from 2.10.5</div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">so I tried to debug with lctl ping from
      the MDS to the lustre client</div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">[root@psmdsana1501 ~]# lctl ping
      172.21.48.250@tcp<br>
      12345-0@lo<br>
      12345-172.21.52.88@o2ib<br>
      12345-172.21.48.250@tcp</div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">172.21.52.88 is the ib interface of the
      lustre client.</div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">so I did <br>
    </div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">[root@psmdsana1501 ~]# lctl ping
      172.21.52.88@o2ib<br>
      12345-0@lo<br>
      12345-172.21.52.88@o2ib<br>
      12345-172.21.48.250@tcp<br>
    </div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">this is the mds lnet.conf<br>
    </div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">ip2nets:<br>
       - net-spec: tcp<br>
         interfaces:<br>
            0: eth0</div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">then I did as you suggested on the MDS:</div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">lctl set_param debug=+"net neterror"</div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">lctl ping 172.21.48.250<br>
      12345-0@lo<br>
      12345-172.21.52.88@o2ib<br>
      12345-172.21.48.250@tcp<br>
    </div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">LOG file:</div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">00000800:00000200:3.0F:1551827094.376143:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a312100] -> 12345-172.21.49.100@tcp (4)<br>
00000800:00000200:3.0:1551827094.376155:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.49.100@tcp ip 172.21.49.100:1021<br>
00000800:00000200:3.0:1551827094.376158:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d800 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827094.376312:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827097.376102:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f346f9400] -> 12345-172.21.49.110@tcp (4)<br>
00000800:00000200:3.0:1551827097.376110:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.49.110@tcp ip 172.21.49.110:1021<br>
00000800:00000200:3.0:1551827097.376114:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d800 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827097.376135:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3d32d000] -> 12345-172.21.48.69@tcp (4)<br>
00000800:00000200:3.0:1551827097.376140:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.69@tcp ip 172.21.48.69:1021<br>
00000800:00000200:3.0:1551827097.376143:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312b00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827097.376199:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3d32c100] -> 12345-172.21.48.207@tcp (4)<br>
00000800:00000200:3.0:1551827097.376206:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.207@tcp ip 172.21.48.207:1021<br>
00000800:00000200:3.0:1551827097.376208:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32cc00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827097.376322:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827097.376339:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827097.376351:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827098.376136:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f49a16c00] -> 12345-172.21.50.66@tcp (4)<br>
00000800:00000200:3.0:1551827098.376142:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.50.66@tcp ip 172.21.50.66:1021<br>
00000800:00000200:3.0:1551827098.376145:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32cc00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827098.376302:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827099.376211:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a312700] -> 12345-172.21.50.25@tcp (4)<br>
00000800:00000200:3.0:1551827099.376217:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.50.25@tcp ip 172.21.50.25:1021<br>
00000800:00000200:3.0:1551827099.376220:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32cc00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827099.376350:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827102.376181:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3d32dc00] -> 12345-172.21.48.230@tcp (4)<br>
00000800:00000200:3.0:1551827102.376189:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.230@tcp ip 172.21.48.230:1021<br>
00000800:00000200:3.0:1551827102.376192:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32cc00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827102.376377:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827103.376099:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a313400] -> 12345-172.21.48.151@tcp (4)<br>
00000800:00000200:3.0:1551827103.376105:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.151@tcp ip 172.21.48.151:1021<br>
00000800:00000200:3.0:1551827103.376108:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32cc00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827103.376166:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3d32d600] -> 12345-172.21.48.208@tcp (4)<br>
00000800:00000200:3.0:1551827103.376169:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.208@tcp ip 172.21.48.208:1021<br>
00000800:00000200:3.0:1551827103.376170:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312b00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827103.376176:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a313a00] -> 12345-172.21.48.210@tcp (4)<br>
00000800:00000200:3.0:1551827103.376178:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.210@tcp ip 172.21.48.210:1021<br>
00000800:00000200:3.0:1551827103.376179:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d800 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827103.376187:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3d32c700] -> 12345-172.21.48.12@tcp (4)<br>
00000800:00000200:3.0:1551827103.376189:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.12@tcp ip 172.21.48.12:1021<br>
00000800:00000200:3.0:1551827103.376190:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32dd00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827103.376288:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827103.376315:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827103.376330:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827103.376345:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827104.376194:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f346f9600] -> 12345-172.21.49.171@tcp (4)<br>
00000800:00000200:3.0:1551827104.376201:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.49.171@tcp ip 172.21.49.171:1021<br>
00000800:00000200:3.0:1551827104.376208:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32dd00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827104.376243:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0b75c59c00] -> 12345-172.21.48.150@tcp (4)<br>
00000800:00000200:3.0:1551827104.376247:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.150@tcp ip 172.21.48.150:1021<br>
00000800:00000200:3.0:1551827104.376249:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d800 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827104.376265:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3d32da00] -> 12345-172.21.48.60@tcp (4)<br>
00000800:00000200:3.0:1551827104.376270:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.60@tcp ip 172.21.48.60:1021<br>
00000800:00000200:3.0:1551827104.376272:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312b00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827104.376283:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3d32d200] -> 12345-172.21.48.68@tcp (4)<br>
00000800:00000200:3.0:1551827104.376287:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.68@tcp ip 172.21.48.68:1021<br>
00000800:00000200:3.0:1551827104.376290:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32cc00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827104.376324:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0b75c59a00] -> 12345-172.21.49.25@tcp (4)<br>
00000800:00000200:3.0:1551827104.376327:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.49.25@tcp ip 172.21.49.25:1021<br>
00000800:00000200:3.0:1551827104.376329:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312900 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827104.376347:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f49a17200] -> 12345-172.21.49.34@tcp (4)<br>
00000800:00000200:3.0:1551827104.376357:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.49.34@tcp ip 172.21.49.34:1021<br>
00000800:00000200:3.0:1551827104.376358:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d900 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827104.376365:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3d32df00] -> 12345-172.21.49.37@tcp (4)<br>
00000800:00000200:3.0:1551827104.376372:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.49.37@tcp ip 172.21.49.37:1021<br>
00000800:00000200:3.0:1551827104.376374:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312300 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827104.376459:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827104.376483:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827104.376503:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827104.376522:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827104.376542:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827104.376599:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827104.376620:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827105.376168:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a312d00] -> 12345-172.21.48.225@tcp (4)<br>
00000800:00000200:3.0:1551827105.376174:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.225@tcp ip 172.21.48.225:1021<br>
00000800:00000200:3.0:1551827105.376177:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312300 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827105.376210:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a313b00] -> 12345-172.21.49.105@tcp (4)<br>
00000800:00000200:3.0:1551827105.376213:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.49.105@tcp ip 172.21.49.105:1021<br>
00000800:00000200:3.0:1551827105.376214:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d900 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827105.376289:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827105.376322:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827106.376096:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a312400] -> 12345-172.21.50.22@tcp (4)<br>
00000800:00000200:3.0:1551827106.376102:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.50.22@tcp ip 172.21.50.22:1021<br>
00000800:00000200:3.0:1551827106.376105:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d900 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827106.376180:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827107.376127:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a313300] -> 12345-172.21.49.33@tcp (4)<br>
00000800:00000200:3.0:1551827107.376135:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.49.33@tcp ip 172.21.49.33:1021<br>
00000800:00000200:3.0:1551827107.376139:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d900 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827107.376167:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a312c00] -> 12345-172.21.32.60@tcp (4)<br>
00000800:00000200:3.0:1551827107.376171:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.32.60@tcp ip 172.21.32.60:1021<br>
00000800:00000200:3.0:1551827107.376173:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312300 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827107.376317:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827107.376347:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000400:00000200:3.0:1551827107.898298:0:17198:0:(lib-move.c:4114:lnet_parse())
      TRACE: 172.21.49.233@tcp(172.21.49.233@tcp) <-
      172.21.49.100@tcp : PUT - for me<br>
00000400:00000200:3.0:1551827107.898314:0:17198:0:(lib-ptl.c:571:lnet_ptl_match_md())
      Request from 12345-172.21.49.100@tcp of length 520 into portal 26
      MB=0x5c3e6495f2d10<br>
00000400:00000100:3.0:1551827107.898320:0:17198:0:(lib-move.c:3753:lnet_parse_put())
      Dropping PUT from 12345-172.21.49.100@tcp portal 26 match
      1622768724421904 offset 0 length 520: 4<br>
00000400:00000200:3.0:1551827107.898323:0:17198:0:(lib-msg.c:816:lnet_is_health_check())
      health check = 0, status = -2, hstatus = 0<br>
00000400:00000200:3.0:1551827107.898336:0:17198:0:(lib-move.c:4114:lnet_parse())
      TRACE: 172.21.49.233@tcp(172.21.49.233@tcp) <-
      172.21.49.100@tcp : PUT - for me<br>
00000400:00000200:3.0:1551827107.898340:0:17198:0:(lib-ptl.c:571:lnet_ptl_match_md())
      Request from 12345-172.21.49.100@tcp of length 520 into portal 12
      MB=0x5c3e6495f2d50<br>
00000400:00000100:3.0:1551827107.898342:0:17198:0:(lib-move.c:3753:lnet_parse_put())
      Dropping PUT from 12345-172.21.49.100@tcp portal 12 match
      1622768724421968 offset 0 length 520: 4<br>
00000400:00000200:3.0:1551827107.898344:0:17198:0:(lib-msg.c:816:lnet_is_health_check())
      health check = 0, status = -2, hstatus = 0<br>
00000800:00000200:3.0:1551827108.376065:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a312800] -> 12345-172.21.48.236@tcp (4)<br>
00000800:00000200:3.0:1551827108.376072:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.236@tcp ip 172.21.48.236:1021<br>
00000800:00000200:3.0:1551827108.376075:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312300 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827108.376124:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a313f00] -> 12345-172.21.49.31@tcp (4)<br>
00000800:00000200:3.0:1551827108.376127:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.49.31@tcp ip 172.21.49.31:1021<br>
00000800:00000200:3.0:1551827108.376128:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d900 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827108.376230:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827108.376251:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827109.376098:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f3a313c00] -> 12345-172.21.48.219@tcp (4)<br>
00000800:00000200:3.0:1551827109.376106:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.219@tcp ip 172.21.48.219:1021<br>
00000800:00000200:3.0:1551827109.376109:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d900 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827109.376283:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000400:00000080:3.0:1551827109.465749:0:17286:0:(module.c:90:libcfs_ioctl())
      libcfs ioctl cmd 3221775678<br>
00000400:00000200:3.0:1551827109.465771:0:17286:0:(lib-move.c:4783:LNetGet())
      LNetGet -> 12345-172.21.48.250@tcp<br>
00000400:00000200:3.0:1551827109.465780:0:17286:0:(lib-move.c:2450:lnet_handle_send_case_locked())
      Source ANY to NMR:  172.21.48.250@tcp local destination<br>
00000400:00000200:3.0:1551827109.465785:0:17286:0:(lib-move.c:1714:lnet_handle_send())
      rspt_next_hop_nid = 172.21.48.250@tcp<br>
00000400:00000200:3.0:1551827109.465792:0:17286:0:(lib-move.c:1728:lnet_handle_send())
      TRACE: 172.21.49.233@tcp(172.21.49.233@tcp:<?>) ->
      172.21.48.250@tcp(<a class="moz-txt-link-abbreviated" href="mailto:172.21.48.250@tcp:172.21.48.250@tcp">172.21.48.250@tcp:172.21.48.250@tcp</a>) : GET try#
      0<br>
00000800:00000200:3.0:1551827109.465798:0:17286:0:(socklnd_cb.c:996:ksocknal_send())
      sending 0 bytes in 0 frags to 12345-172.21.48.250@tcp<br>
00000800:00000200:3.0:1551827109.465804:0:17286:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f49a9be00] -> 12345-172.21.48.250@tcp (5)<br>
00000800:00000200:3.0:1551827109.465812:0:17286:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.250@tcp ip 172.21.48.250:988<br>
00000800:00000200:3.0:1551827109.465815:0:17286:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f307a7300 type 2, nob 96 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827109.465874:0:17198:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000400:00000200:3.0:1551827109.465883:0:17198:0:(lib-msg.c:816:lnet_is_health_check())
      health check = 1, status = 0, hstatus = 0<br>
00000400:00000200:3.0:1551827109.465887:0:17198:0:(lib-msg.c:630:lnet_health_check())
      health check: 172.21.49.233@tcp->172.21.48.250@tcp: GET: OK<br>
00000400:00000200:3.0:1551827109.465896:0:17286:0:(api-ni.c:4045:lnet_ping())
      poll 1(5 0)<br>
00000400:00000200:3.0:1551827109.466442:0:17198:0:(lib-move.c:4114:lnet_parse())
      TRACE: 172.21.49.233@tcp(172.21.49.233@tcp) <-
      172.21.48.250@tcp : REPLY - for me<br>
00000400:00000200:3.0:1551827109.466449:0:17198:0:(lib-move.c:3881:lnet_parse_reply())
      172.21.49.233@tcp: Reply from 12345-172.21.48.250@tcp of length
      64/64 into md 0x1d<br>
00000400:00000200:3.0:1551827109.466459:0:17198:0:(lib-md.c:69:lnet_md_unlink())
      Unlinking md ffff8e0f37258088<br>
00000400:00000200:3.0:1551827109.466460:0:17198:0:(lib-msg.c:816:lnet_is_health_check())
      health check = 0, status = 0, hstatus = 0<br>
00000400:00000200:3.0:1551827109.466466:0:17286:0:(api-ni.c:4045:lnet_ping())
      poll 1(3 0) unlinked<br>
00000800:00000200:3.0:1551827110.376074:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f37728500] -> 12345-172.21.48.203@tcp (4)<br>
00000800:00000200:3.0:1551827110.376084:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.203@tcp ip 172.21.48.203:1021<br>
00000800:00000200:3.0:1551827110.376089:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32d900 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827110.376124:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f37728f00] -> 12345-172.21.48.11@tcp (4)<br>
00000800:00000200:3.0:1551827110.376127:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.11@tcp ip 172.21.48.11:1021<br>
00000800:00000200:3.0:1551827110.376129:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312300 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827110.376141:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f37729f00] -> 12345-172.21.48.223@tcp (4)<br>
00000800:00000200:3.0:1551827110.376143:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.223@tcp ip 172.21.48.223:1021<br>
00000800:00000200:3.0:1551827110.376145:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312900 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827110.376207:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f37729900] -> 12345-172.21.48.71@tcp (4)<br>
00000800:00000200:3.0:1551827110.376211:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.71@tcp ip 172.21.48.71:1021<br>
00000800:00000200:3.0:1551827110.376212:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32cc00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827110.376279:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827110.376290:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827110.376303:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827110.376322:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000400:00000200:3.0:1551827110.730073:0:17198:0:(lib-move.c:4114:lnet_parse())
      TRACE: 172.21.49.233@tcp(172.21.49.233@tcp) <- 172.21.48.69@tcp
      : PUT - for me<br>
00000400:00000200:3.0:1551827110.730085:0:17198:0:(lib-ptl.c:571:lnet_ptl_match_md())
      Request from 12345-172.21.48.69@tcp of length 520 into portal 26
      MB=0x5c4c25c755750<br>
00000400:00000100:3.0:1551827110.730091:0:17198:0:(lib-move.c:3753:lnet_parse_put())
      Dropping PUT from 12345-172.21.48.69@tcp portal 26 match
      1623713937446736 offset 0 length 520: 4<br>
00000400:00000200:3.0:1551827110.730093:0:17198:0:(lib-msg.c:816:lnet_is_health_check())
      health check = 0, status = -2, hstatus = 0<br>
00000400:00000200:3.0:1551827110.730103:0:17198:0:(lib-move.c:4114:lnet_parse())
      TRACE: 172.21.49.233@tcp(172.21.49.233@tcp) <- 172.21.48.69@tcp
      : PUT - for me<br>
00000400:00000200:3.0:1551827110.730106:0:17198:0:(lib-ptl.c:571:lnet_ptl_match_md())
      Request from 12345-172.21.48.69@tcp of length 520 into portal 12
      MB=0x5c4c25c755760<br>
00000400:00000100:3.0:1551827110.730113:0:17198:0:(lib-move.c:3753:lnet_parse_put())
      Dropping PUT from 12345-172.21.48.69@tcp portal 12 match
      1623713937446752 offset 0 length 520: 4<br>
00000400:00000200:3.0:1551827110.730115:0:17198:0:(lib-msg.c:816:lnet_is_health_check())
      health check = 0, status = -2, hstatus = 0<br>
00000800:00000200:3.0:1551827111.376066:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f37728c00] -> 12345-172.21.48.72@tcp (4)<br>
00000800:00000200:3.0:1551827111.376072:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.72@tcp ip 172.21.48.72:1021<br>
00000800:00000200:3.0:1551827111.376075:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3d32cc00 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827111.376119:0:17197:0:(socklnd.c:195:ksocknal_find_peer_locked())
      got peer_ni [ffff8e0f37729600] -> 12345-172.21.48.204@tcp (4)<br>
00000800:00000200:3.0:1551827111.376121:0:17197:0:(socklnd_cb.c:757:ksocknal_queue_tx_locked())
      Sending to 12345-172.21.48.204@tcp ip 172.21.48.204:1021<br>
00000800:00000200:3.0:1551827111.376122:0:17197:0:(socklnd_cb.c:776:ksocknal_queue_tx_locked())
      Packet ffff8e0f3a312900 type 192, nob 24 niov 1 nkiov 0<br>
00000800:00000200:3.0:1551827111.376218:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
00000800:00000200:3.0:1551827111.376230:0:17200:0:(socklnd_cb.c:549:ksocknal_process_transmit())
      send(0) 0<br>
      Debug log: 162 lines, 162 kept, 0 dropped, 0 bad.<br>
    </div>
    <div class="moz-cite-prefix"><br>
    </div>
    <div class="moz-cite-prefix">On 3/5/19 2:15 PM, Amir Shehata wrote:<br>
    </div>
    <blockquote type="cite"
cite="mid:CAAqp6i0McPG0qhsPr8KqFjD-KCRv1BoYinJOxq_FvrXuXYqFjA@mail.gmail.com">
      <meta http-equiv="content-type" content="text/html; charset=UTF-8">
      <div dir="ltr">
        <div dir="ltr">Take a look at this: <a
            href="https://jira.whamcloud.com/browse/LU-11840"
            moz-do-not-send="true">https://jira.whamcloud.com/browse/LU-11840</a></div>
        <div>Let me know if this is the same issue you're seeing.<br>
        </div>
      </div>
      <br>
      <div class="gmail_quote">
        <div dir="ltr" class="gmail_attr">On Tue, 5 Mar 2019 at 14:04,
          Amir Shehata <<a
            href="mailto:amir.shehata.whamcloud@gmail.com"
            moz-do-not-send="true">amir.shehata.whamcloud@gmail.com</a>>
          wrote:<br>
        </div>
        <blockquote class="gmail_quote" style="margin:0px 0px 0px
          0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
          <div dir="ltr">
            <div>Hi Riccardo,</div>
            <div><br>
            </div>
            <div>It's not LNet Health. It's Dynamic Discovery. What's
              happening is that 2.12 is discovering all the interfaces
              on the peer. That's why you see all the interfaces in the
              peer show.</div>
            <div><br>
            </div>
            <div>Multi-Rail doesn't enable o2ib. It just sees it. If the
              node doing the discovery has only tcp, then it should
              never try to connect over the o2ib. <br>
            </div>
            <div><br>
            </div>
            <div>Are you able to do a "lnetctl ping 172.21.48.250@tcp"
              from the MDS multiple times? Do you see the ping failing
              intermittently?</div>
            <div><br>
            </div>
            <div>What should happen is that when the MDS (running 2.12)
              tries to talk to the peer you have identified, then it'll
              discover its interfaces. But then should realize that it
              can only reach it on the tcp network, since that's the
              only network configured on the MDS.</div>
            <div><br>
            </div>
            <div>It might help, if you just configure LNet only, on the
              MDS and the peer and run a simple</div>
            <div>lctl set_param debug=+"net neterror"<br>
            </div>
            <div> lnetctl ping <></div>
            <div>lctl dk >log</div>
            <div><br>
            </div>
            <div>If you can share the debug output, it'll help to
              pinpoint the problem.</div>
            <div><br>
            </div>
            <div>thanks</div>
            <div>amir<br>
            </div>
          </div>
          <br>
          <div class="gmail_quote">
            <div dir="ltr" class="gmail_attr">On Tue, 5 Mar 2019 at
              12:30, Riccardo Veraldi <<a
                href="mailto:Riccardo.Veraldi@cnaf.infn.it"
                target="_blank" moz-do-not-send="true">Riccardo.Veraldi@cnaf.infn.it</a>>
              wrote:<br>
            </div>
            <blockquote class="gmail_quote" style="margin:0px 0px 0px
              0.8ex;border-left:1px solid
              rgb(204,204,204);padding-left:1ex">I think I figured out
              the problem.<br>
              My problem is related to Lnet Network Health feature: <br>
              <a href="https://jira.whamcloud.com/browse/LU-9120"
                rel="noreferrer" target="_blank" moz-do-not-send="true">https://jira.whamcloud.com/browse/LU-9120</a><br>
              the lustre MDS and the lustre client having same version
              2.12.0 <br>
              negotiate a Multi-rail peer connection while this does not
              happen with <br>
              the other clients (2.10.5). So what happens is that both
              IB and tcp are <br>
              being used during transfers.<br>
              tcp is only for connecting to the MDS, IB only to connect
              to the OSS <br>
              anyway Multi-rail is enabled by default between the
              MDS,OSS and client. <br>
              This messes up the situation. The MDS has only one TCP
              interface and <br>
              cannot communicate by IB but in the "lnetctl peer show" a
              NID @o2ib <br>
              shows up and it should not. At this point the MDS tries to
              connect to <br>
              the client using IB and it will never work because there
              is no IB on the <br>
              MDS.<br>
              MDS Lnet configuration:<br>
              <br>
              net:<br>
                   - net type: lo<br>
                     local NI(s):<br>
                       - nid: 0@lo<br>
                         status: up<br>
                   - net type: tcp<br>
                     local NI(s):<br>
                       - nid: 172.21.49.233@tcp<br>
                         status: up<br>
                         interfaces:<br>
                             0: eth0<br>
              <br>
              but if I look at lnetctl peer show I See<br>
              <br>
                  - primary nid: 172.21.52.88@o2ib<br>
                     Multi-Rail: True<br>
                     peer ni:<br>
                       - nid: 172.21.48.250@tcp<br>
                         state: NA<br>
                       - nid: 172.21.52.88@o2ib<br>
                         state: NA<br>
                       - nid: 172.21.48.250@tcp1<br>
                         state: NA<br>
                       - nid: 172.21.48.250@tcp2<br>
                         state: NA<br>
              <br>
              there should be no o2ib nid but Multi-rail for some reason
              enables it.<br>
              I do not have problems with the other clients (non 2.12.0)<br>
              <br>
              How can I disable Multi-rail on 2.12.0 ??<br>
              <br>
              thank you<br>
              <br>
              <br>
              <br>
              On 3/5/19 12:14 PM, Patrick Farrell wrote:<br>
              > Riccardo,<br>
              ><br>
              > Since 2.12 is still a relatively new maintenance
              release, it would be helpful if you could open an LU and
              provide more detail there - Such as what clients were
              doing, if you were using any new features (like DoM or
              FLR), and full dmesg from the clients and servers involved
              in these evictions.<br>
              ><br>
              > - Patrick<br>
              ><br>
              > On 3/5/19, 11:50 AM, "lustre-discuss on behalf of
              Riccardo Veraldi" <<a
                href="mailto:lustre-discuss-bounces@lists.lustre.org"
                target="_blank" moz-do-not-send="true">lustre-discuss-bounces@lists.lustre.org</a>
              on behalf of <a
                href="mailto:Riccardo.Veraldi@cnaf.infn.it"
                target="_blank" moz-do-not-send="true">Riccardo.Veraldi@cnaf.infn.it</a>>
              wrote:<br>
              ><br>
              >      Hello,<br>
              >      <br>
              >      I have quite a big issue on my Lustre 2.12.0
              MDS/MDT.<br>
              >      <br>
              >      Clients moving data to the OSS occur into a
              locking problem I never met<br>
              >      before.<br>
              >      <br>
              >      The clients are mostly 2.10.5 except for one
              which is 2.12.0 but<br>
              >      regardless the client version the problem is
              still there.<br>
              >      <br>
              >      So these are the errors I see on the MDS/MDT.
              When this happens<br>
              >      everything just hangs. If I reboot the MDS
              everything is back to<br>
              >      normality but it happened already 2 times in 3
              days and it is disrupting.<br>
              >      <br>
              >      Any hints ?<br>
              >      <br>
              >      Is it feasible to downgrade from 2.12.0 to
              2.10.6 ?<br>
              >      <br>
              >      thanks<br>
              >      <br>
              >      Mar  5 11:10:33 psmdsana1501 kernel: Lustre:<br>
              >     
              7898:0:(client.c:2132:ptlrpc_expire_one_request()) @@@
              Request sent has<br>
              >      failed due to network error: [sent
              1551813033/real 1551813033]<br>
              >      req@ffff9fdcbecd0300 x1626845000210688/t0(0)<br>
              >      o104-><a
                href="mailto:ana15-MDT0000@172.21.52." target="_blank"
                moz-do-not-send="true">ana15-MDT0000@172.21.52.</a>87@o2ib:15/16
              lens 296/224 e 0 to 1 dl<br>
              >      1551813044 ref 1 fl Rpc:eX/0/ffffffff rc 0/-1<br>
              >      Mar  5 11:10:33 psmdsana1501 kernel: Lustre:<br>
              >     
              7898:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped
              50552576<br>
              >      previous similar messages<br>
              >      Mar  5 11:13:03 psmdsana1501 kernel:
              LustreError:<br>
              >     
              7898:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ###
              client (nid<br>
              >      172.21.52.87@o2ib) failed to reply to blocking
              AST (req@ffff9fdcbecd0300<br>
              >      x1626845000210688 status 0 rc -110), evict it
              ns: mdt-ana15-MDT0000_UUID<br>
              >      lock: ffff9fde9b6873c0/0x9824623d2148ef38 lrc:
              4/0,0 mode: PR/PR res:<br>
              >      [0x2000013a9:0x1d347:0x0].0x0 bits 0x13/0x0 rrc:
              5 type: IBT flags:<br>
              >      0x60200400000020 nid: 172.21.52.87@o2ib remote:
              0xd8efecd6e7621e63<br>
              >      expref: 8 pid: 7898 timeout: 333081 lvb_type: 0<br>
              >      Mar  5 11:13:03 psmdsana1501 kernel:
              LustreError: 138-a: ana15-MDT0000:<br>
              >      A client on nid 172.21.52.87@o2ib was evicted
              due to a lock blocking<br>
              >      callback time out: rc -110<br>
              >      Mar  5 11:13:03 psmdsana1501 kernel:
              LustreError:<br>
              >      5321:0:(ldlm_lockd.c:256:expired_lock_main())
              ### lock callback timer<br>
              >      expired after 150s: evicting client at
              172.21.52.87@o2ib ns:<br>
              >      mdt-ana15-MDT0000_UUID lock:
              ffff9fde9b6873c0/0x9824623d2148ef38 lrc:<br>
              >      3/0,0 mode: PR/PR res:
              [0x2000013a9:0x1d347:0x0].0x0 bits 0x13/0x0 rrc:<br>
              >      5 type: IBT flags: 0x60200400000020 nid:
              172.21.52.87@o2ib remote:<br>
              >      0xd8efecd6e7621e63 expref: 9 pid: 7898 timeout:
              0 lvb_type: 0<br>
              >      Mar  5 11:13:04 psmdsana1501 kernel: Lustre:
              ana15-MDT0000: Connection<br>
              >      restored to 59c5a826-f4e9-0dd0-8d4f-08c204f25941
              (at 172.21.52.87@o2ib)<br>
              >      Mar  5 11:15:34 psmdsana1501 kernel:
              LustreError:<br>
              >     
              7898:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ###
              client (nid<br>
              >      172.21.52.142@o2ib) failed to reply to blocking
              AST<br>
              >      (req@ffff9fde2d393600 x1626845000213776 status 0
              rc -110), evict it ns:<br>
              >      mdt-ana15-MDT0000_UUID lock:
              ffff9fde9b6858c0/0x9824623d2148efee lrc:<br>
              >      4/0,0 mode: PR/PR res: [0x2000013ac:0x1:0x0].0x0
              bits 0x13/0x0 rrc: 3<br>
              >      type: IBT flags: 0x60200400000020 nid:
              172.21.52.142@o2ib remote:<br>
              >      0xbb35541ea6663082 expref: 9 pid: 7898 timeout:
              333232 lvb_type: 0<br>
              >      Mar  5 11:15:34 psmdsana1501 kernel:
              LustreError: 138-a: ana15-MDT0000:<br>
              >      A client on nid 172.21.52.142@o2ib was evicted
              due to a lock blocking<br>
              >      callback time out: rc -110<br>
              >      Mar  5 11:15:34 psmdsana1501 kernel:
              LustreError:<br>
              >      5321:0:(ldlm_lockd.c:256:expired_lock_main())
              ### lock callback timer<br>
              >      expired after 151s: evicting client at
              172.21.52.142@o2ib ns:<br>
              >      mdt-ana15-MDT0000_UUID lock:
              ffff9fde9b6858c0/0x9824623d2148efee lrc:<br>
              >      3/0,0 mode: PR/PR res: [0x2000013ac:0x1:0x0].0x0
              bits 0x13/0x0 rrc: 3<br>
              >      type: IBT flags: 0x60200400000020 nid:
              172.21.52.142@o2ib remote:<br>
              >      0xbb35541ea6663082 expref: 10 pid: 7898 timeout:
              0 lvb_type: 0<br>
              >      Mar  5 11:15:34 psmdsana1501 kernel: Lustre:
              ana15-MDT0000: Connection<br>
              >      restored to 9d49a115-646b-c006-fd85-000a4b90019a
              (at 172.21.52.142@o2ib)<br>
              >      Mar  5 11:20:33 psmdsana1501 kernel: Lustre:<br>
              >     
              7898:0:(client.c:2132:ptlrpc_expire_one_request()) @@@
              Request sent has<br>
              >      failed due to network error: [sent
              1551813633/real 1551813633]<br>
              >      req@ffff9fdcc2a95100 x1626845000222624/t0(0)<br>
              >      o104-><a
                href="mailto:ana15-MDT0000@172.21.52." target="_blank"
                moz-do-not-send="true">ana15-MDT0000@172.21.52.</a>87@o2ib:15/16
              lens 296/224 e 0 to 1 dl<br>
              >      1551813644 ref 1 fl Rpc:eX/2/ffffffff rc 0/-1<br>
              >      Mar  5 11:20:33 psmdsana1501 kernel: Lustre:<br>
              >     
              7898:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped
              23570550<br>
              >      previous similar messages<br>
              >      Mar  5 11:22:46 psmdsana1501 kernel:
              LustreError:<br>
              >     
              7898:0:(ldlm_lockd.c:682:ldlm_handle_ast_error()) ###
              client (nid<br>
              >      172.21.52.87@o2ib) failed to reply to blocking
              AST (req@ffff9fdcc2a95100<br>
              >      x1626845000222624 status 0 rc -110), evict it
              ns: mdt-ana15-MDT0000_UUID<br>
              >      lock: ffff9fde86ffdf80/0x9824623d2148f23a lrc:
              4/0,0 mode: PR/PR res:<br>
              >      [0x2000013ae:0x1:0x0].0x0 bits 0x13/0x0 rrc: 3
              type: IBT flags:<br>
              >      0x60200400000020 nid: 172.21.52.87@o2ib remote:
              0xd8efecd6e7621eb7<br>
              >      expref: 9 pid: 7898 timeout: 333665 lvb_type: 0<br>
              >      Mar  5 11:22:46 psmdsana1501 kernel:
              LustreError: 138-a: ana15-MDT0000:<br>
              >      A client on nid 172.21.52.87@o2ib was evicted
              due to a lock blocking<br>
              >      callback time out: rc -110<br>
              >      Mar  5 11:22:46 psmdsana1501 kernel:
              LustreError:<br>
              >      5321:0:(ldlm_lockd.c:256:expired_lock_main())
              ### lock callback timer<br>
              >      expired after 150s: evicting client at
              172.21.52.87@o2ib ns:<br>
              >      mdt-ana15-MDT0000_UUID lock:
              ffff9fde86ffdf80/0x9824623d2148f23a lrc:<br>
              >      3/0,0 mode: PR/PR res: [0x2000013ae:0x1:0x0].0x0
              bits 0x13/0x0 rrc: 3<br>
              >      type: IBT flags: 0x60200400000020 nid:
              172.21.52.87@o2ib remote:<br>
              >      0xd8efecd6e7621eb7 expref: 10 pid: 7898 timeout:
              0 lvb_type: 0<br>
              >      <br>
              >      <br>
              >      _______________________________________________<br>
              >      lustre-discuss mailing list<br>
              >      <a
                href="mailto:lustre-discuss@lists.lustre.org"
                target="_blank" moz-do-not-send="true">lustre-discuss@lists.lustre.org</a><br>
              >      <a
                href="http://lists.lustre.org/listinfo.cgi/lustre-discuss-lustre.org"
                rel="noreferrer" target="_blank" moz-do-not-send="true">http://lists.lustre.org/listinfo.cgi/lustre-discuss-lustre.org</a><br>
              >      <br>
              ><br>
              > _______________________________________________<br>
              > lustre-discuss mailing list<br>
              > <a href="mailto:lustre-discuss@lists.lustre.org"
                target="_blank" moz-do-not-send="true">lustre-discuss@lists.lustre.org</a><br>
              > <a
                href="http://lists.lustre.org/listinfo.cgi/lustre-discuss-lustre.org"
                rel="noreferrer" target="_blank" moz-do-not-send="true">http://lists.lustre.org/listinfo.cgi/lustre-discuss-lustre.org</a><br>
              <br>
              <br>
              _______________________________________________<br>
              lustre-discuss mailing list<br>
              <a href="mailto:lustre-discuss@lists.lustre.org"
                target="_blank" moz-do-not-send="true">lustre-discuss@lists.lustre.org</a><br>
              <a
                href="http://lists.lustre.org/listinfo.cgi/lustre-discuss-lustre.org"
                rel="noreferrer" target="_blank" moz-do-not-send="true">http://lists.lustre.org/listinfo.cgi/lustre-discuss-lustre.org</a><br>
            </blockquote>
          </div>
        </blockquote>
      </div>
    </blockquote>
    <p><br>
    </p>
  </body>
</html>