Improved GPU selection so that device affinity can be controlled via CUDA_VISIBLE_DEVICES.

jirikraus · jirikraus · commit 1636262e5d4f · 2022-02-23T09:53:07.000+01:00
diff --git a/mpi/jacobi.cpp b/mpi/jacobi.cpp
@@ -158,6 +158,8 @@ int main(int argc, char* argv[]) {
     MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
     int size;
     MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
 
     const int iter_max = get_argval<int>(argv, argv + argc, "-niter", 1000);
     const int nccheck = get_argval<int>(argv, argv + argc, "-nccheck", 1);
@@ -176,7 +178,7 @@ int main(int argc, char* argv[]) {
         MPI_CALL(MPI_Comm_free(&local_comm));
     }
 
-    CUDA_RT_CALL(cudaSetDevice(local_rank));
+    CUDA_RT_CALL(cudaSetDevice(local_rank%num_devices));
     CUDA_RT_CALL(cudaFree(0));
 
     real* a_ref_h;
diff --git a/mpi_overlap/jacobi.cpp b/mpi_overlap/jacobi.cpp
@@ -158,6 +158,8 @@ int main(int argc, char* argv[]) {
     MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
     int size;
     MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
 
     const int iter_max = get_argval<int>(argv, argv + argc, "-niter", 1000);
     const int nccheck = get_argval<int>(argv, argv + argc, "-nccheck", 1);
@@ -177,7 +179,7 @@ int main(int argc, char* argv[]) {
         MPI_CALL(MPI_Comm_free(&local_comm));
     }
 
-    CUDA_RT_CALL(cudaSetDevice(local_rank));
+    CUDA_RT_CALL(cudaSetDevice(local_rank%num_devices));
     CUDA_RT_CALL(cudaFree(0));
 
     real* a_ref_h;
diff --git a/nccl/jacobi.cpp b/nccl/jacobi.cpp
@@ -161,6 +161,8 @@ int main(int argc, char* argv[]) {
     MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
     int size;
     MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
 
     ncclUniqueId nccl_uid;
     if (rank == 0) NCCL_CALL(ncclGetUniqueId(&nccl_uid));
@@ -173,17 +175,29 @@ int main(int argc, char* argv[]) {
     const bool csv = get_arg(argv, argv + argc, "-csv");
 
     int local_rank = -1;
+    int local_size = 1;
     {
         MPI_Comm local_comm;
         MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
                                      &local_comm));
 
         MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+        MPI_CALL(MPI_Comm_size(local_comm, &local_size));
 
         MPI_CALL(MPI_Comm_free(&local_comm));
     }
-
-    CUDA_RT_CALL(cudaSetDevice(local_rank));
+    if ( 1 < num_devices && num_devices < local_size )
+    {
+        fprintf(stderr,"ERROR Number of visible devices (%d) is less than number of ranks on the node (%d)!\n", num_devices, local_size);
+        MPI_CALL(MPI_Finalize());
+        return 1;
+    }
+    if ( 1 == num_devices ) {
+        // Only 1 device visible, assuming GPU affinity is handled via CUDA_VISIBLE_DEVICES
+        CUDA_RT_CALL(cudaSetDevice(0));
+    } else {
+        CUDA_RT_CALL(cudaSetDevice(local_rank));
+    }
     CUDA_RT_CALL(cudaFree(0));
 
     ncclComm_t nccl_comm;
diff --git a/nccl_overlap/jacobi.cpp b/nccl_overlap/jacobi.cpp
@@ -161,6 +161,8 @@ int main(int argc, char* argv[]) {
     MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
     int size;
     MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
 
     ncclUniqueId nccl_uid;
     if (rank == 0) NCCL_CALL(ncclGetUniqueId(&nccl_uid));
@@ -173,17 +175,29 @@ int main(int argc, char* argv[]) {
     const bool csv = get_arg(argv, argv + argc, "-csv");
 
     int local_rank = -1;
+    int local_size = 1;
     {
         MPI_Comm local_comm;
         MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
                                      &local_comm));
 
         MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+        MPI_CALL(MPI_Comm_size(local_comm, &local_size));
 
         MPI_CALL(MPI_Comm_free(&local_comm));
     }
-
-    CUDA_RT_CALL(cudaSetDevice(local_rank));
+    if ( 1 < num_devices && num_devices < local_size )
+    {
+        fprintf(stderr,"ERROR Number of visible devices (%d) is less than number of ranks on the node (%d)!\n", num_devices, local_size);
+        MPI_CALL(MPI_Finalize());
+        return 1;
+    }
+    if ( 1 == num_devices ) {
+        // Only 1 device visible, assuming GPU affinity is handled via CUDA_VISIBLE_DEVICES
+        CUDA_RT_CALL(cudaSetDevice(0));
+    } else {
+        CUDA_RT_CALL(cudaSetDevice(local_rank));
+    }
     CUDA_RT_CALL(cudaFree(0));
 
     ncclComm_t nccl_comm;
diff --git a/nvshmem/jacobi.cu b/nvshmem/jacobi.cu
@@ -231,28 +231,23 @@ int main(int argc, char* argv[]) {
     int num_devices;
     CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
 
-    int local_rank = -1, local_size = 1;
+    int local_rank = -1;
+    int local_size = 1;
     {
         MPI_Comm local_comm;
-        MPI_Info info;
-        MPI_CALL(MPI_Info_create(&info));
-        MPI_CALL(
-            MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, info, &local_comm));
+        MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                     &local_comm));
 
         MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
         MPI_CALL(MPI_Comm_size(local_comm, &local_size));
-        if (num_devices < local_size) {
-            fprintf(stderr,
-                    "ERROR: Number of devices is less numer of PEs \
-                    on the node!\n");
-            MPI_CALL(MPI_Comm_free(&local_comm));
-            MPI_CALL(MPI_Info_free(&info));
-            MPI_CALL(MPI_Finalize());
-            return -1;
-        }
 
         MPI_CALL(MPI_Comm_free(&local_comm));
-        MPI_CALL(MPI_Info_free(&info));
+    }
+    if ( num_devices < local_size )
+    {
+        fprintf(stderr,"ERROR Number of visible devices (%d) is less than number of ranks on the node (%d)!\n", num_devices, local_size);
+        MPI_CALL(MPI_Finalize());
+        return 1;
     }
     CUDA_RT_CALL(cudaSetDevice(local_rank));
     CUDA_RT_CALL(cudaFree(0));
diff --git a/nvshmem_opt/jacobi.cu b/nvshmem_opt/jacobi.cu
@@ -256,28 +256,23 @@ int main(int argc, char* argv[]) {
     int num_devices;
     CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
 
-    int local_rank = -1, local_size = 1;
+    int local_rank = -1;
+    int local_size = 1;
     {
         MPI_Comm local_comm;
-        MPI_Info info;
-        MPI_CALL(MPI_Info_create(&info));
-        MPI_CALL(
-            MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, info, &local_comm));
+        MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                     &local_comm));
 
         MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
         MPI_CALL(MPI_Comm_size(local_comm, &local_size));
-        if (num_devices < local_size) {
-            fprintf(stderr,
-                    "ERROR: Number of devices is less numer of PEs \
-                    on the node!\n");
-            MPI_CALL(MPI_Comm_free(&local_comm));
-            MPI_CALL(MPI_Info_free(&info));
-            MPI_CALL(MPI_Finalize());
-            return -1;
-        }
 
         MPI_CALL(MPI_Comm_free(&local_comm));
-        MPI_CALL(MPI_Info_free(&info));
+    }
+    if ( num_devices < local_size )
+    {
+        fprintf(stderr,"ERROR Number of visible devices (%d) is less than number of ranks on the node (%d)!\n", num_devices, local_size);
+        MPI_CALL(MPI_Finalize());
+        return 1;
     }
     CUDA_RT_CALL(cudaSetDevice(local_rank));
     CUDA_RT_CALL(cudaFree(0));