1

we are trying to connect two gpus located on two servers via RDMA and infinibands. The GPUs are Nvidia RTX 6000 Ada and the infinbands are NVIDIA ConnectX-6.

Server configuration

Our server has the configuration of the image, where we have the GPU connected in slot 2 (although it occupies slot 1 and 2) and the connectX in slot 3. By looking at the connection between the infiniband and the GPU (terminal command nvidia-smi topo -m) you can see that the connection is NODE.

Terminal output Terminal output of nvidia-smi topo -m

According to the web page: NVIDIA Configuration | Juniper Networks, this causes a bad performance, but due to the layout of our server, it is not possible to move the gpu nor the connect X.

We have programmed two scripts in Python, one for the sending server and one for the receiving server.

The code for the server that sends data is the following

import socket
import cupy as cp
import ctypes
from pyverbs.device import Context
from pyverbs.pd import PD
from pyverbs.cq import CQ
from pyverbs.qp import QPCap, QPInitAttr, QPAttr, QP
from pyverbs.mr import MR
from pyverbs.enums import *
from pyverbs.addr import AHAttr
from pyverbs import device as d
from pyverbs.wr import SGE, SendWR
import ctypes

# Setup RDMA device
lst = d.get_device_list()
ctx = Context(name=lst[0].name.decode())
pd = PD(ctx)
cq = CQ(ctx, 10)

# Allocate GPU buffer with CuPy
gpu_buf = cp.arange(1, 11, dtype=cp.uint8)  # [1,2,3,...,10] on GPU
gpu_ptr = gpu_buf.data.ptr
#gpu_ptr = ctypes.c_uint64(gpu_buf.data.ptr).value

# Register GPU memory
mr = MR(creator=pd, length=gpu_buf.nbytes, address=gpu_ptr,
         access=IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE  | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_ON_DEMAND | IBV_ACCESS_MW_BIND  )

# Setup QP
cap = QPCap(max_send_wr=10, max_recv_wr=10, max_send_sge=1, max_recv_sge=1)
init_attr = QPInitAttr(qp_type=IBV_QPT_RC, scq=cq, rcq=cq, cap=cap)
qp = QP(pd, init_attr)

# Exchange info
port_attr = ctx.query_port(1)
lid = port_attr.lid
psn = 0

s = socket.socket()
s.connect(('192.168.2.5', 18515))  # Receiver IP
s.send(f"{lid},{qp.qp_num},{psn},{mr.rkey},{hex(mr.buf)},".encode())
remote_info = s.recv(1024).decode().split(',')
remote_lid, remote_qpn, remote_psn, remote_rkey, remote_addr = int(remote_info[0]), int(remote_info[1]), int(remote_info[2]), int(remote_info[3]), int(remote_info[4], 16)

# QP state transitions
attr = QPAttr()
attr.qp_state = IBV_QPS_INIT
attr.pkey_index = 0
attr.port_num = 1
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND | IBV_ACCESS_MW_BIND
qp.modify(attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)

attr = QPAttr()
attr.qp_state = IBV_QPS_RTR
attr.path_mtu = 5
attr.dest_qp_num = remote_qpn
attr.rq_psn = remote_psn
attr.max_dest_rd_atomic = 1
attr.min_rnr_timer = 12
attr.ah_attr = AHAttr(port_num=1, dlid=remote_lid)
qp.modify(attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
                 IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
                 IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)

attr = QPAttr()
attr.qp_state = IBV_QPS_RTS
attr.timeout = 14
attr.retry_cnt = 7
attr.rnr_retry = 7
attr.sq_psn = psn
attr.max_rd_atomic = 1
qp.modify(attr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
                 IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)

# Post RDMA write
sge = SGE(addr=mr.buf, length=mr.length, lkey=mr.lkey)
wr = SendWR(
    wr_id=1,
    num_sge=1,
    sg=[sge],
    opcode=IBV_WR_RDMA_WRITE,
    send_flags=IBV_SEND_SIGNALED
)
wr.set_wr_rdma(rkey=remote_rkey, addr=remote_addr)

ret = qp.post_send(wr)
if ret != 0:
    print("Failed to post RDMA write!")
else:
    print("RDMA write posted.")

wc = cq.poll(num_entries=1)
for _ in range(1000000):
    if wc[0] > 0 :
        print("RDMA write completed successfully.")
        break
    else:
        print("RDMA write failed or no completion.")

print("Sender GPU buffer sent:", gpu_buf.get())  # Copy to host for printing
s.close()

The receiver’s code follows the same structure:

import socket
import cupy as cp
import ctypes
import time
from pyverbs.device import Context
from pyverbs.pd import PD
from pyverbs.cq import CQ
from pyverbs.qp import QPCap, QPInitAttr, QPAttr, QP
from pyverbs.mr import MR
from pyverbs.enums import *
from pyverbs.addr import AHAttr
from pyverbs import device as d

# Setup RDMA device
lst = d.get_device_list()
ctx = Context(name=lst[0].name.decode())
pd = PD(ctx)
cq = CQ(ctx, 10)

# Allocate GPU buffer
gpu_buf = cp.zeros(10, dtype=cp.uint8)
gpu_ptr = gpu_buf.data.ptr

# Register GPU memory
mr = MR(creator=pd, length=gpu_buf.nbytes, address=gpu_ptr,
        access=IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE  | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_ON_DEMAND | IBV_ACCESS_MW_BIND)

# Setup QP
cap = QPCap(max_send_wr=10, max_recv_wr=10, max_send_sge=1, max_recv_sge=1)
init_attr = QPInitAttr(qp_type=IBV_QPT_RC, scq=cq, rcq=cq, cap=cap)
qp = QP(pd, init_attr)

# Exchange info
port_attr = ctx.query_port(1)
lid = port_attr.lid
psn = 0

s = socket.socket()
s.bind(('', 18515))
s.listen(1)
conn, _ = s.accept()
remote_info = conn.recv(1024).decode().split(',')
remote_lid, remote_qpn, remote_psn, remote_rkey, remote_addr = int(remote_info[0]), int(remote_info[1]), int(
    remote_info[2]), int(remote_info[3]), int(remote_info[4], 16)
conn.send(f"{lid},{qp.qp_num},{psn},{mr.rkey},{hex(mr.buf)},".encode())

# QP state transitions
attr = QPAttr()
attr.qp_state = IBV_QPS_INIT
attr.pkey_index = 0
attr.port_num = 1
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_ON_DEMAND | IBV_ACCESS_MW_BIND
qp.modify(attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)

attr = QPAttr()
attr.qp_state = IBV_QPS_RTR
attr.path_mtu = 5
attr.dest_qp_num = remote_qpn
attr.rq_psn = remote_psn
attr.max_dest_rd_atomic = 1
attr.min_rnr_timer = 12
attr.ah_attr = AHAttr(port_num=1, dlid=remote_lid)
qp.modify(attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
          IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
          IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)

attr = QPAttr()
attr.qp_state = IBV_QPS_RTS
attr.timeout = 14
attr.retry_cnt = 7
attr.rnr_retry = 7
attr.sq_psn = psn
attr.max_rd_atomic = 1
qp.modify(attr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
          IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)

print('Receiver ready. Waiting for RDMA write...')
time.sleep(5)  # Wait for sender

print('Receiver GPU buffer after RDMA write:', gpu_buf.get())  # Copy to host for printing
s.close()

but we can’t see the changes in the message sent on the receiver’s side. Is it possible to make the connection between them despite having a NODE connection type?

On the other hand, we are not sure if we have the nvidia-peermem kernel enabled correctly and if this may be affecting the transfer.

Thank you very much

I'm trying to connect two gpus in two servers with rdma and connectX6 to avoid the transfer with the CPU and obtain a lower latency.

1
  • Were you able to successfully run ib_write_bw with the --cuda flag between the two GPUs? Commented Aug 11 at 12:59

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.