[Lustre-discuss] o2ib cant ping/mount Infiniband NID

subbu kl subbukl at gmail.com
Tue Jan 27 02:19:43 PST 2009


Liang,

Please find the info you asked for below.

There are two nodes, MDS and OSS1, connected through a SilverStorm InfiniBand
switch, and the MDS is running the IB subnet manager.

[root at MDS ~]# cat /etc/modprobe.conf
alias eth0 bnx2
alias eth1 bnx2
alias scsi_hostadapter megaraid_sas
alias scsi_hostadapter1 ata_piix
alias scsi_hostadapter2 usb-storage
alias ib0 ib_ipoib
alias ib1 ib_ipoib
alias net-pf-27 ib_sdp
options loop max_loop=64
options lnet networks=o2ib(ib0)
options ib_madeye data=1

[root at MDS ~]# ifconfig
eth0 Link encap:Ethernet HWaddr 00:18:8B:40:63:C3
  inet addr:172.24.198.128 Bcast:172.24.255.255 Mask:255.255.0.0
  inet6 addr: fe80::218:8bff:fe40:63c3/64 Scope:Link
  UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
  RX packets:4203 errors:0 dropped:0 overruns:0 frame:0
  TX packets:1069 errors:0 dropped:0 overruns:0 carrier:0
  collisions:0 txqueuelen:1000
  RX bytes:415345 (405.6 KiB) TX bytes:109548 (106.9 KiB)
  Interrupt:169 Memory:f8000000-f8012100

ib0 Link encap:InfiniBand HWaddr
80:00:04:04:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00
  inet addr:172.24.198.140 Bcast:172.24.255.255 Mask:255.255.0.0
  inet6 addr: fe80::202:c902:22:cd49/64 Scope:Link
  UP BROADCAST RUNNING MULTICAST MTU:65520 Metric:1
  RX packets:8 errors:0 dropped:0 overruns:0 frame:0
  TX packets:11 errors:0 dropped:6 overruns:0 carrier:0
  collisions:0 txqueuelen:256
  RX bytes:4163 (4.0 KiB) TX bytes:4205 (4.1 KiB)

lo Link encap:Local Loopback
  inet addr:127.0.0.1 Mask:255.0.0.0
  inet6 addr: ::1/128 Scope:Host
  UP LOOPBACK RUNNING MTU:16436 Metric:1
  RX packets:1614 errors:0 dropped:0 overruns:0 frame:0
  TX packets:1614 errors:0 dropped:0 overruns:0 carrier:0
  collisions:0 txqueuelen:0
  RX bytes:5322452 (5.0 MiB) TX bytes:5322452 (5.0 MiB)

[root at MDS ~]# lctl list_nids
172.24.198.140 at o2ib
[root at MDS ~]# route -e
Kernel IP routing table
Destination Gateway Genmask Flags MSS Window irtt Iface
172.24.0.0 * 255.255.0.0 U 0 0 0 eth0
172.24.0.0 * 255.255.0.0 U 0 0 0 ib0
169.254.0.0 * 255.255.0.0 U 0 0 0 ib0
default 172.24.198.250 0.0.0.0 UG 0 0 0 eth0

[root at MDS ~]# echo +neterror > /proc/sys/lnet/printk

[root at MDS ~]# echo +neterror > /proc/sys/lnet/printk
[root at MDS ~]# lctl list_nids
172.24.198.140 at o2ib
[root at MDS ~]# lctl ping 172.24.198.140 at o2ib
12345-0 at lo
12345-172.24.198.140 at o2ib
[root at MDS ~]# lctl ping 172.24.198.141 at o2ib
failed to ping 172.24.198.141 at o2ib: Input/output error


/var/log/messages :

Jan 27 15:41:41 MDS kernel: Lustre:
5649:0:(o2iblnd_cb.c:2704:kiblnd_cm_callback()) 172.24.198.141 at o2ib: ROUTE
ERROR -22
Jan 27 15:41:41 MDS kernel: Lustre:
5649:0:(o2iblnd_cb.c:2118:kiblnd_peer_connect_failed()) Deleting messages
for 172.24.198.141 at o2ib: connection failed












[root at OSS1 ~]# cat /etc/modprobe.conf
alias eth0 e1000
alias eth1 e1000
alias scsi_hostadapter megaraid_mbox
alias scsi_hostadapter1 qla2xxx
options loop max_loop=64
alias ib0 ib_ipoib
options lnet networks=o2ib(ib0)
options ib_ipoib debug_level=1
options ib_ipoib mcast_debug_level=1
### BEGIN MPP Driver Comments ###
remove mppUpper if [ `ls -a /proc/mpp | wc -l` -gt 2 ]; then echo -e "Please
Unload Physical HBA Driver prior to unloading mppUpper."; else
/sbin/modprobe -r --ignore-remove mppUpper; fi
# Additional config info can be found in /opt/mpp/modprobe.conf.mppappend.
# The Above config info is needed if you want to make mkinitrd manually.
# Please read the Readme file that came with MPP driver for building RamDisk
manually.
# Edit the '/etc/modprobe.conf' file and run 'mppUpdate' to create Ramdisk
dynamically.
### END MPP Driver Comments ###
#alias ib1 ib_ipoib
alias net-pf-27 ib_sdp
options ib_madeye data=1

[root at OSS1 ~]# ifconfig
eth0 Link encap:Ethernet HWaddr 00:13:72:5D:3B:65
  inet addr:172.24.198.186 Bcast:172.24.255.255 Mask:255.255.0.0
  inet6 addr: fe80::213:72ff:fe5d:3b65/64 Scope:Link
  UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
  RX packets:7831 errors:0 dropped:0 overruns:0 frame:0
  TX packets:1007 errors:0 dropped:0 overruns:0 carrier:0
  collisions:0 txqueuelen:100
  RX bytes:809440 (790.4 KiB) TX bytes:99439 (97.1 KiB)
  Base address:0xdcc0 Memory:df7e0000-df800000

ib0 Link encap:InfiniBand HWaddr
80:00:04:04:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00
  inet addr:172.24.198.141 Bcast:172.24.255.255 Mask:255.255.0.0
  inet6 addr: fe80::202:c902:21:550d/64 Scope:Link
  UP BROADCAST RUNNING MULTICAST MTU:65520 Metric:1
  RX packets:10 errors:0 dropped:0 overruns:0 frame:0
  TX packets:11 errors:0 dropped:7 overruns:0 carrier:0
  collisions:0 txqueuelen:256
  RX bytes:4097 (4.0 KiB) TX bytes:5202 (5.0 KiB)

lo Link encap:Local Loopback
  inet addr:127.0.0.1 Mask:255.0.0.0
  inet6 addr: ::1/128 Scope:Host
  UP LOOPBACK RUNNING MTU:16436 Metric:1
  RX packets:94 errors:0 dropped:0 overruns:0 frame:0
  TX packets:94 errors:0 dropped:0 overruns:0 carrier:0
  collisions:0 txqueuelen:0
  RX bytes:8962 (8.7 KiB) TX bytes:8962 (8.7 KiB)

virbr0 Link encap:Ethernet HWaddr 00:00:00:00:00:00
  inet addr:192.168.122.1 Bcast:192.168.122.255 Mask:255.255.255.0
  inet6 addr: fe80::200:ff:fe00:0/64 Scope:Link
  UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
  RX packets:0 errors:0 dropped:0 overruns:0 frame:0
  TX packets:49 errors:0 dropped:0 overruns:0 carrier:0
  collisions:0 txqueuelen:0
  RX bytes:0 (0.0 b) TX bytes:9166 (8.9 KiB)

[root at OSS1 ~]# lctl list_nids
172.24.198.141 at o2ib
[root at OSS1 ~]# route -e
Kernel IP routing table
Destination Gateway Genmask Flags MSS Window irtt Iface
192.168.122.0 * 255.255.255.0 U 0 0 0 virbr0
172.24.0.0 * 255.255.0.0 U 0 0 0 eth0
172.24.0.0 * 255.255.0.0 U 0 0 0 ib0
169.254.0.0 * 255.255.0.0 U 0 0 0 ib0
default 172.24.198.250 0.0.0.0 UG 0 0 0 eth0

[root at OSS1 ~]# echo +neterror > /proc/sys/lnet/printk
[root at OSS1 ~]# echo +neterror > /proc/sys/lnet/printk
[root at OSS1 ~]# lctl list_nids
172.24.198.141 at o2ib
[root at OSS1 ~]# lctl ping 172.24.198.141 at o2ib
12345-0 at lo
12345-172.24.198.141 at o2ib
[root at OSS1 ~]# lctl ping 172.24.198.140 at o2ib
failed to ping 172.24.198.140 at o2ib: Input/output error


/var/log/messages :

Jan 27 15:34:17 OSS1 kernel: Lustre:
2776:0:(o2iblnd_cb.c:2704:kiblnd_cm_callback()) 172.24.198.140 at o2ib: ROUTE
ERROR -22
Jan 27 15:34:17 OSS1 kernel: Lustre:
2776:0:(o2iblnd_cb.c:2118:kiblnd_peer_connect_failed()) Deleting messages
for 172.24.198.140 at o2ib: connection failed



~subbu

On Sat, Jan 24, 2009 at 8:06 AM, Liang Zhen <Zhen.Liang at sun.com> wrote:

> Subbu,
> I think we won't see anything in tcpdump even if ping runs successfully,
> because we only need IPoIB for connecting (not for transactions).
> I think we need these information for diagnosing:
> 1. modprobe.conf  of two nodes with IB
> 2. ifconfig on these two nodes
> 3. routing table on these two nodes
> 4. try lctl ping itself on both nodes and see if any error (with +neterror)
>
> Regards
> Liang
>
> subbu kl:
>
>>  The problem remains the same: when I run lctl ping with tcpdump 4.0.0, I
>> don't see any activity on ib0!
>>
>> Here is another exhaustive Lustre debug log I took with lctl ping. Do you
>> see any problem with it?
>>
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(module.c:160:libcfs_psdev_open()) Process entered
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(module.c:164:libcfs_psdev_open()) kmalloced 'ldu': 8 at f5bc6620
>> (tot 7258558).
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(module.c:171:libcfs_psdev_open()) Process leaving (rc=0 : 0 : 0)
>> Jan 23 17:23:39 p186 kernel: Lustre: 14294:0:(module.c:228:libcfs_ioctl())
>> Process entered
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(linux-module.c:49:libcfs_ioctl_getdata()) Process entered
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(linux-module.c:90:libcfs_ioctl_getdata()) Process leaving (rc=0 : 0
>> : 0)
>> Jan 23 17:23:39 p186 kernel: Lustre: 14294:0:(api-ni.c:1223:LNetNIInit())
>> refs 1
>> Jan 23 17:23:39 p186 kernel: Lustre: 14294:0:(api-ni.c:1614:lnet_ping())
>> kmalloced 'info': 144 at f0b95880 (tot 7258702).
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(lib-lnet.h:251:lnet_eq_alloc()) kmalloced 'eq': 48 at efda1a00 (tot
>> 7258750).
>> Jan 23 17:23:39 p186 kernel: Lustre: 14294:0:(lib-eq.c:72:LNetEQAlloc())
>> kmalloced 'eq->eq_events': 240 at f0b95c80 (tot 7258990).
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(lib-lnet.h:279:lnet_md_alloc()) kmalloced 'md': 84 at ed16acc0 (tot
>> 7259074).
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(lib-lnet.h:327:lnet_msg_alloc()) kmalloced 'msg': 268 at f205a400
>> (tot 7259342).
>> Jan 23 17:23:39 p186 kernel: Lustre: 14294:0:(lib-move.c:2395:LNetGet())
>> LNetGet -> 12345-172.24.198.140 at o2ib
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(o2iblnd_cb.c:1531:kiblnd_send()) sending 0 bytes in 0 frags to
>> 12345-172.24.198.140 at o2ib
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(o2iblnd.c:312:kiblnd_create_peer()) kmalloced 'peer': 56 at
>> efda18c0 (tot 7259398).
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(o2iblnd_cb.c:1501:kiblnd_launch_tx()) peer[efda18c0] ->
>> 172.24.198.140 at o2ib (1)++
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(o2iblnd_cb.c:1380:kiblnd_connect_peer()) peer[efda18c0] ->
>> 172.24.198.140 at o2ib (2)++
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(o2iblnd_cb.c:1507:kiblnd_launch_tx()) peer[efda18c0] ->
>> 172.24.198.140 at o2ib (3)--
>> Jan 23 17:23:39 p186 kernel: Lustre: 14294:0:(lib-eq.c:209:LNetEQPoll())
>> Process entered
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:146:lib_get_event()) Process entered
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:149:lib_get_event()) event: f0b95cf8, sequence: 1,
>> eq->size: 2
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:152:lib_get_event()) Process leaving (rc=0 : 0 : 0)
>> Jan 23 17:23:39 p186 kernel: Lustre:
>> 2782:0:(o2iblnd_cb.c:2682:kiblnd_cm_callback()) 172.24.198.140 at o2ib Addr
>> resolved: 0
>> Jan 23 17:23:40 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:146:lib_get_event()) Process entered
>> Jan 23 17:23:40 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:149:lib_get_event()) event: f0b95cf8, sequence: 1,
>> eq->size: 2
>> Jan 23 17:23:40 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:152:lib_get_event()) Process leaving (rc=0 : 0 : 0)
>> Jan 23 17:23:40 p186 kernel: Lustre: 14294:0:(lib-eq.c:239:LNetEQPoll())
>> Process leaving (rc=0 : 0 : 0)
>> Jan 23 17:23:40 p186 kernel: Lustre: 14294:0:(api-ni.c:1665:lnet_ping())
>> poll 0(-1 -1)
>> Jan 23 17:23:40 p186 kernel: Lustre:
>> 14294:0:(lib-md.c:69:lnet_md_unlink()) Queueing unlink of md ed16acc0
>> Jan 23 17:23:40 p186 kernel: Lustre: 14294:0:(lib-eq.c:209:LNetEQPoll())
>> Process entered
>> Jan 23 17:23:40 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:146:lib_get_event()) Process entered
>> Jan 23 17:23:40 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:149:lib_get_event()) event: f0b95cf8, sequence: 1,
>> eq->size: 2
>> Jan 23 17:23:40 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:152:lib_get_event()) Process leaving (rc=0 : 0 : 0)
>> Jan 23 17:23:56 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=4294962944
>> : -4352 : ffffef00)
>> Jan 23 17:23:56 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=4294966784
>> : -512 : fffffe00)
>> Jan 23 17:23:56 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=2817 :
>> 2817 : b01)
>> Jan 23 17:23:56 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=2047 :
>> 2047 : 7ff)
>> Jan 23 17:23:56 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=4294740832
>> : -226464 : fffc8b60)
>> Jan 23 17:23:56 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=4286216485
>> : -8750811 : ff7a7925)
>> Jan 23 17:23:56 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=5821091 :
>> 5821091 : 58d2a3)
>> Jan 23 17:23:56 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=3356952 :
>> 3356952 : 333918)
>> Jan 23 17:23:56 p186 kernel: Lustre:
>> 8276:0:(pinger.c:193:ptlrpc_pinger_main()) next ping in 25000 (8510847)
>> Jan 23 17:24:21 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=4294962944
>> : -4352 : ffffef00)
>> Jan 23 17:24:21 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=4294966784
>> : -512 : fffffe00)
>> Jan 23 17:24:21 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=2817 :
>> 2817 : b01)
>> Jan 23 17:24:21 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=2047 :
>> 2047 : 7ff)
>> Jan 23 17:24:21 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=4294740832
>> : -226464 : fffc8b60)
>> Jan 23 17:24:21 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=4286216485
>> : -8750811 : ff7a7925)
>> Jan 23 17:24:21 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=5821091 :
>> 5821091 : 58d2a3)
>> Jan 23 17:24:21 p186 kernel: Lustre:
>> 8276:0:(lvfs_lib.c:173:lprocfs_read_helper()) Process leaving (rc=3356952 :
>> 3356952 : 333918)
>> Jan 23 17:24:21 p186 kernel: Lustre:
>> 8276:0:(pinger.c:193:ptlrpc_pinger_main()) next ping in 25000 (8535847)
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 2794:0:(o2iblnd_cb.c:2704:kiblnd_cm_callback()) 172.24.198.140 at o2ib:
>> ROUTE ERROR -110
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 2794:0:(o2iblnd.c:422:kiblnd_unlink_peer_locked()) peer[efda18c0] ->
>> 172.24.198.140 at o2ib (2)--
>> Jan 23 17:24:29 p186 kernel: Lustre: 2794:0:(router.c:151:lnet_notify())
>> 172.24.198.141 at o2ib notifying 172.24.198.140 at o2ib: down
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 2794:0:(router.c:82:lnet_notify_locked()) Old news
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 2794:0:(o2iblnd_cb.c:2118:kiblnd_peer_connect_failed()) Deleting messages
>> for 172.24.198.140 at o2ib: connection failed
>> Jan 23 17:24:29 p186 kernel: Lustre: 2794:0:(lib-md.c:73:lnet_md_unlink())
>> Unlinking md ed16acc0
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 2794:0:(lib-lnet.h:301:lnet_md_free()) kfreed 'md': 84 at ed16acc0 (tot
>> 7259314).
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 2794:0:(lib-lnet.h:344:lnet_msg_free()) kfreed 'msg': 268 at f205a400 (tot
>> 7259046).
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 2794:0:(o2iblnd_cb.c:2706:kiblnd_cm_callback()) peer[efda18c0] ->
>> 172.24.198.140 at o2ib (1)--
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 2794:0:(o2iblnd.c:357:kiblnd_destroy_peer()) kfreed 'peer': 56 at efda18c0
>> (tot 7258990).
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:146:lib_get_event()) Process entered
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:149:lib_get_event()) event: f0b95cf8, sequence: 1,
>> eq->size: 2
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 14294:0:(lib-eq.c:170:lib_get_event()) Process leaving (rc=1 : 1 : 1)
>> Jan 23 17:24:29 p186 kernel: Lustre: 14294:0:(lib-eq.c:232:LNetEQPoll())
>> Process leaving (rc=1 : 1 : 1)
>> Jan 23 17:24:29 p186 kernel: Lustre: 14294:0:(api-ni.c:1665:lnet_ping())
>> poll 1(4 -113) unlinked
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 14294:0:(lib-lnet.h:259:lnet_eq_free()) kfreed 'eq': 48 at efda1a00 (tot
>> 7258942).
>> Jan 23 17:24:29 p186 kernel: Lustre: 14294:0:(lib-eq.c:135:LNetEQFree())
>> kfreed 'events': 240 at f0b95c80 (tot 7258702).
>> Jan 23 17:24:29 p186 kernel: Lustre: 14294:0:(api-ni.c:1772:lnet_ping())
>> kfreed 'info': 144 at f0b95880 (tot 7258558).
>> Jan 23 17:24:29 p186 kernel: Lustre: 14294:0:(module.c:336:libcfs_ioctl())
>> Process leaving (rc=4294967291 : -5 : fffffffb)
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 14294:0:(module.c:178:libcfs_psdev_release()) Process entered
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 14294:0:(module.c:183:libcfs_psdev_release()) kfreed 'ldu': 8 at f5bc6620
>> (tot 7258550).
>> Jan 23 17:24:29 p186 kernel: Lustre:
>> 14294:0:(module.c:187:libcfs_psdev_release()) Process leaving (rc=0 : 0 : 0)
>>
>> ~subbu
>>
>> On Fri, Jan 16, 2009 at 3:38 PM, subbu kl <subbukl at gmail.com <mailto:
>> subbukl at gmail.com>> wrote:
>>
>>    Liang,
>>
>>    Right; you reproduced the exact problem. But as you can see in my
>>    previous mail, I think I have solved that problem by manually
>>    assigning an IP to ib0 (check the line "# ifconfig ib0 172.24.198.111"
>>    and the *"Added LNI" lines*).
>>
>>    We are back to square one now, I guess! LNET is up with manually
>>    assigned IPs. Normal ping succeeds between machines, but lctl ping fails.
>>
>>    so my current problem is this :
>>
>>    # lctl ping 172.24.198.112 at o2ib
>>    failed to ping 172.24.198.112 at o2ib: Input/output error
>>
>>    /var/log/messages:
>>
>>
>>    Jan 16 10:24:14 p128 kernel: Lustre: 2750:0:(o2iblnd_cb.c:2687:
>>    kiblnd_cm_callback()) 172.24.198.112 at o2ib: ROUTE ERROR -22
>>    Jan 16 10:24:14 p128 kernel: Lustre:
>>    2750:0:(o2iblnd_cb.c:2101:kiblnd_peer_connect_failed()) Deleting
>>    messages for 172.24.198.112 at o2ib: connection failed
>>
>>    how can I get rid of this connection problem?
>>
>>    ~subbu
>>
>>
>>
>>    On Fri, Jan 16, 2009 at 2:11 PM, Liang Zhen <Zhen.Liang at sun.com
>>     <mailto:Zhen.Liang at sun.com>> wrote:
>>
>>        Subbu,
>>
>>        We don't have any tips for setting up IPoIB. It looks like Linux
>>        can't find the ifaddr of ib0 on the MDS (-99 is EADDRNOTAVAIL), so
>>        I think it's because you didn't assign any address to ib0 (or
>>        failed to assign an address to ib0) before loading o2iblnd in
>>        the first try.
>>        I can reproduce exactly same error by:
>>        1. modprobe ib_ipoib
>>        2. ifconfig ib0 up  // without assign any address
>>        3. modprobe ko2iblnd
>>        4. lctl network up
>>
>>        Regards
>>        Liang
>>
>>        subbu kl:
>>
>>            Liang,
>>            after executing the following echo:
>>            echo +neterror > /proc/sys/lnet/printk
>>
>>            now lctl ping shows the following error
>>
>>            # lctl ping 172.24.198.112 at o2ib
>>            failed to ping 172.24.198.112 at o2ib: Input/output error
>>
>>            Jan 16 10:24:14 p128 kernel: Lustre:
>>            2750:0:(o2iblnd_cb.c:2687:kiblnd_cm_callback())
>>            172.24.198.112 at o2ib: ROUTE ERROR -22
>>            Jan 16 10:24:14 p128 kernel: Lustre:
>>            2750:0:(o2iblnd_cb.c:2101:kiblnd_peer_connect_failed())
>>            Deleting messages for 172.24.198.112 at o2ib: connection failed
>>
>>            Looks like some problem with "IB connection manager" !
>>
>>            1. Do we have any help docs for setting up IPoIB and Lustre?
>>            The Lustre operations manual has very minimal info about this.
>>            I think I am missing some part of the IPoIB setup here.
>>            2. Or is the manual assignment of IP addresses to "ib0"
>>            creating some problem?
>>
>>
>>            *Some more supporting info :
>>            *subnet manager of following version is also running :
>>            OpenSM 3.1.8
>>
>>            Initially I got this error for MDS mount
>>
>>            Jan 16 09:45:20 p128 kernel: LustreError:
>>            4991:0:(linux-tcpip.c:124:libcfs_ipif_query()) Can't get
>>            IP address for interface ib0
>>            Jan 16 09:45:20 p128 kernel: LustreError:
>>            4991:0:(o2iblnd.c:1563:kiblnd_startup()) Can't query IPoIB
>>            interface ib0: -99
>>            Jan 16 09:45:21 p128 kernel: LustreError: 105-4: Error
>>            -100 starting up LNI o2ib
>>            Jan 16 09:45:21 p128 kernel: LustreError:
>>            4991:0:(events.c:707:ptlrpc_init_portals()) network
>>            initialisation failed
>>            Jan 16 09:45:21 p128 modprobe: WARNING: Error inserting
>>            ptlrpc
>>
>>  (/lib/modules/2.6.18-53.1.14.el5_lustre.1.6.5.1smp/kernel/fs/lustre/ptlrpc.ko):
>>            Input/output error
>>            Jan 16 09:45:21 p128 modprobe: WARNING: Error inserting
>>            osc
>>
>>  (/lib/modules/2.6.18-53.1.14.el5_lustre.1.6.5.1smp/kernel/fs/lustre/osc.ko):
>>            Unknown symbol in module, or unknown parameter (see dmesg)
>>            Jan 16 09:45:21 p128 kernel: osc: Unknown symbol
>>            ldlm_prep_enqueue_req
>>            Jan 16 09:45:21 p128 kernel: osc: Unknown symbol
>>            ldlm_resource_get
>>            Jan 16 09:45:21 p128 kernel: osc: Unknown symbol
>>            ptlrpc_lprocfs_register_obd
>>            .
>>            .
>>            .
>>
>>            then I manually set the IP address for ib0 as follows:
>>            # ifconfig ib0 172.24.198.111
>>
>>            [root at p186 ~]# ifconfig ib0
>>            ib0       Link encap:InfiniBand  HWaddr
>>            80:00:04:04:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00
>>                     inet addr:172.24.198.112  Bcast:172.24.255.255
>>             Mask:255.255.0.0
>>                     UP BROADCAST MULTICAST  MTU:65520  Metric:1
>>                     RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>>                     TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>>                     collisions:0 txqueuelen:256
>>                     RX bytes:0 (0.0 b)  TX bytes:0 (0.0 b)
>>
>>            then it mounted successfully
>>
>>            *Jan 16 09:47:09 p128 kernel: Lustre: Added LNI
>>            172.24.198.111 at o2ib [8/64]
>>            Jan 16 09:47:09 p128 kernel: Lustre: MGS MGS started*
>>            Jan 16 09:47:09 p128 kernel: Lustre: Setting parameter
>>            lustre-MDT0000.mdt.group_upcall in log lustre-MDT0000
>>            Jan 16 09:47:09 p128 kernel: Lustre: Enabling user_xattr
>>            Jan 16 09:47:09 p128 kernel: Lustre: lustre-MDT0000: new
>>            disk, initializing
>>            Jan 16 09:47:09 p128 kernel: Lustre: MDT lustre-MDT0000
>>            now serving dev
>>            (lustre-MDT0000/64db1fc7-03ba-9803-4d20-ab0d2aa66116) with
>>            recovery enabled
>>            Jan 16 09:47:09 p128 kernel: Lustre:
>>            5274:0:(lproc_mds.c:262:lprocfs_wr_group_upcall())
>>            lustre-MDT0000: group upcall set to /usr/sbin/l_getgroups
>>            Jan 16 09:47:09 p128 kernel: Lustre: lustre-MDT0000.mdt:
>>            set parameter group_upcall=/usr/sbin/l_getgroups
>>            Jan 16 09:47:09 p128 kernel: Lustre: Server lustre-MDT0000
>>            on device /dev/loop0 has started
>>            .
>>            .
>>            .
>>
>>
>>            ~subbu
>>
>>
>>            On Thu, Jan 15, 2009 at 8:37 PM, Liang Zhen
>>            <Zhen.Liang at sun.com <mailto:Zhen.Liang at sun.com>
>>            <mailto:Zhen.Liang at sun.com <mailto:Zhen.Liang at sun.com>>>
>>            wrote:
>>
>>               Subbu,
>>
>>               I'd suggest:
>>               1) make sure ko2iblnd has been brought up (please check
>>            if there
>>               is any error message when startup ko2iblnd)
>>               2) echo +neterror > /proc/sys/lnet/printk, then try
>>            with lctl
>>               ping, if it still can't work please post error messages
>>
>>               Regards
>>               Liang
>>
>>               subbu kl:
>>
>>                   The problem is similar to
>>
>> http://lists.lustre.org/pipermail/lustre-discuss/2008-May/007498.html
>>                   But by looking at the thread I could not really find
>>            the solution
>>                   for the problem.
>>
>>                   I have two RHEL5 Linux servers installed with
>>            following packages -
>>
>>                   kernel-lustre-smp-2.6.18-53.1.14.el5_lustre.1.6.5.1
>>                   kernel-ib-1.3-2.6.18_53.1.14.el5_lustre.1.6.5.1smp
>>
>> lustre-ldiskfs-3.0.4-2.6.18_53.1.14.el5_lustre.1.6.5.1smp
>>                   lustre-1.6.5.1-2.6.18_53.1.14.el5_lustre.1.6.5.1smp
>>
>> lustre-modules-1.6.5.1-2.6.18_53.1.14.el5_lustre.1.6.5.1smp
>>                   e2fsprogs-1.40.7.sun3-0redhat
>>
>>
>>                   machine 1: with ib0 IP address : 172.24.198.111
>>                   machine 2: with ib0 IP address : 172.24.198.112
>>
>>                   /etc/modprobe.conf contains
>>                   options lnet networks=o2ib
>>
>>                   TCP networking worked fine, and now I am trying with an
>>            InfiniBand
>>                   network and finding it difficult to communicate with the
>>            IB nodes.
>>                   The mount attempt throws the following error:
>>
>>                   [root at p186 ~]# mount -t lustre -o loop
>>            /tmp/lustre-ost1 /mnt/ost1
>>                   mount.lustre: mount /dev/loop0 at /mnt/ost1 failed:
>>                   Input/output error
>>                   Is the MGS running?
>>
>>                   /var/log/messages :
>>                   Jan 15 16:55:25 p186 kernel: kjournald starting.
>>             Commit
>>                   interval 5 seconds
>>                   Jan 15 16:55:25 p186 kernel: LDISKFS FS on loop0,
>>            internal journal
>>                   Jan 15 16:55:25 p186 kernel: LDISKFS-fs: mounted
>>            filesystem
>>                   with ordered data mode.
>>                   Jan 15 16:55:25 p186 kernel: kjournald starting.
>>             Commit
>>                   interval 5 seconds
>>                   Jan 15 16:55:25 p186 kernel: LDISKFS FS on loop0,
>>            internal journal
>>                   Jan 15 16:55:25 p186 kernel: LDISKFS-fs: mounted
>>            filesystem
>>                   with ordered data mode.
>>                   Jan 15 16:55:25 p186 kernel: LDISKFS-fs: file
>>            extents enabled
>>                   Jan 15 16:55:25 p186 kernel: LDISKFS-fs: mballoc
>>            enabled
>>                   Jan 15 16:55:30 p186 kernel: Lustre: Request x7
>>            sent from
>>                   MGC172.24.198.111 at o2ib to NID 172.24.198.111 at o2ib
>>            5s ago has
>>                   timed out (limit 5s).
>>                   Jan 15 16:55:30 p186 kernel: LustreError:
>>                   7193:0:(obd_mount.c:1062:server_start_targets())
>>            Required
>>                   registration failed for lustre-OSTffff: -5
>>                   Jan 15 16:55:30 p186 kernel: LustreError: 15f-b:
>>            Communication
>>                   error with the MGS.  Is the MGS running?
>>                   Jan 15 16:55:30 p186 kernel: LustreError:
>>                   7193:0:(obd_mount.c:1597:server_fill_super())
>>            Unable to start
>>                   targets: -5
>>                   Jan 15 16:55:30 p186 kernel: LustreError:
>>                   7193:0:(obd_mount.c:1382:server_put_super()) no obd
>>            lustre-OSTffff
>>                   Jan 15 16:55:30 p186 kernel: LustreError:
>>                   7193:0:(obd_mount.c:119:server_deregister_mount())
>>                   lustre-OSTffff not registered
>>                   Jan 15 16:55:30 p186 kernel: LDISKFS-fs: mballoc: 0
>>            blocks 0
>>                   reqs (0 success)
>>                   Jan 15 16:55:30 p186 kernel: LDISKFS-fs: mballoc: 0
>>            extents
>>                   scanned, 0 goal hits, 0 2^N hits, 0 breaks, 0 lost
>>                   Jan 15 16:55:30 p186 kernel: LDISKFS-fs: mballoc: 0
>>            generated
>>                   and it took 0
>>                   Jan 15 16:55:30 p186 kernel: LDISKFS-fs: mballoc: 0
>>                   preallocated, 0 discarded
>>                   Jan 15 16:55:30 p186 kernel: Lustre: server umount
>>                   lustre-OSTffff complete
>>                   Jan 15 16:55:30 p186 kernel: LustreError:
>>                   7193:0:(obd_mount.c:1951:lustre_fill_super())
>>            Unable to mount
>>                    (-5)
>>
>>                   All attempts to ping the IB NIDs (local/remote)
>>            also failed.
>>                   I can ping the IP addresses:
>>                   [root at p186 ~]# ping 172.24.198.112
>>                   PING 172.24.198.112 (172.24.198.112) 56(84) bytes
>>            of data.
>>                   64 bytes from 172.24.198.112 <http://172.24.198.112>:
>>                   icmp_seq=1 ttl=64 time=0.052 ms
>>                   64 bytes from 172.24.198.112 <http://172.24.198.112>:
>>                   icmp_seq=2 ttl=64 time=0.024 ms
>>
>>
>>                   --- 172.24.198.112 ping statistics ---
>>                   2 packets transmitted, 2 received, 0% packet loss,
>>            time 1000ms
>>                   rtt min/avg/max/mdev = 0.024/0.038/0.052/0.014 ms
>>                   [root at p186 ~]# ping 172.24.198.111
>>                   PING 172.24.198.111 (172.24.198.111) 56(84) bytes
>>            of data.
>>                   64 bytes from 172.24.198.111 <http://172.24.198.111>:
>>                   icmp_seq=1 ttl=64 time=2.16 ms
>>                   64 bytes from 172.24.198.111 <http://172.24.198.111>:
>>                   icmp_seq=2 ttl=64 time=0.296 ms
>>
>>
>>                   --- 172.24.198.111 ping statistics ---
>>                   2 packets transmitted, 2 received, 0% packet loss,
>>            time 1000ms
>>                   rtt min/avg/max/mdev = 0.296/1.231/2.166/0.935 ms
>>
>>                   but can't ping the NIDs:
>>                   [root at p186 ~]# lctl ping 172.24.198.112 at o2ib
>>                   failed to ping 172.24.198.112 at o2ib: Input/output error
>>                   [root at p186 ~]# lctl ping 172.24.198.111 at o2ib
>>                   failed to ping 172.24.198.111 at o2ib: Input/output error
>>
>>                   Any idea why LNET can't ping the NIDs?
>>
>>                   some more configurations:
>>                   [root at p186 ~]# ibstat
>>                   CA 'mthca0'
>>                          CA type: MT23108
>>                          Number of ports: 2
>>                          Firmware version: 3.5.0
>>                          Hardware version: a1
>>                          Node GUID: 0x0002c9020021550c
>>
>>                   Machines are connected via IB switch.
>>
>>                   Looking forward to your help.
>>
>>                   ~subbu
>>
>> ------------------------------------------------------------------------
>>
>>                   _______________________________________________
>>                   Lustre-discuss mailing list
>>                   Lustre-discuss at lists.lustre.org
>>            <mailto:Lustre-discuss at lists.lustre.org>
>>                   <mailto:Lustre-discuss at lists.lustre.org
>>            <mailto:Lustre-discuss at lists.lustre.org>>
>>
>>                   http://lists.lustre.org/mailman/listinfo/lustre-discuss
>>
>>
>>
>>
>>            --            . . . s u b b u
>>            "You've got to be original, because if you're like someone
>>            else, what do they need you for?"
>>
>>  ------------------------------------------------------------------------
>>
>>            _______________________________________________
>>            Lustre-discuss mailing list
>>            Lustre-discuss at lists.lustre.org
>>            <mailto:Lustre-discuss at lists.lustre.org>
>>            http://lists.lustre.org/mailman/listinfo/lustre-discuss
>>
>>
>>
>>
>>
>>    --    . . . s u b b u
>>    "You've got to be original, because if you're like someone else,
>>    what do they need you for?"
>>
>>
>>
>>
>> --
>> . . . s u b b u
>> "You've got to be original, because if you're like someone else, what do
>> they need you for?"
>> ------------------------------------------------------------------------
>>
>> _______________________________________________
>> Lustre-discuss mailing list
>> Lustre-discuss at lists.lustre.org
>> http://lists.lustre.org/mailman/listinfo/lustre-discuss
>>
>>
>
>


-- 
. . . s u b b u
"You've got to be original, because if you're like someone else, what do
they need you for?"
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.lustre.org/pipermail/lustre-discuss-lustre.org/attachments/20090127/e85bdf80/attachment.htm>


More information about the lustre-discuss mailing list