2

I am trying to get started on dynamic process creation in MPI. I have a parent code (main.c) trying to spawn new worker/child processes (worker.c) and merge both into one intracommunicator. The parent code (main.c) is

#include<stdio.h>
#include "mpi.h"

/* Parent-side fragment as posted: spawns workers, merges with them, prints sizes. */
/* Intercommunicator handle filled in by MPI_Comm_spawn (assigned on rank 0 only). */
MPI_Comm child_comm;
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);

/* Everything up to the matching closing brace runs on world rank 0 only. */
if(rank == 0 )
{
   int  num_processes_to_spawn = 2;
   /* Spawning communicator is MPI_COMM_SELF, so only the calling process is on the parent side. */
   MPI_Comm_spawn("worker", MPI_ARGV_NULL, num_processes_to_spawn, MPI_INFO_NULL, 0, MPI_COMM_SELF, &child_comm, MPI_ERRCODES_IGNORE );

/* Merge the parent/child intercommunicator into one intracommunicator (this side passes 0). */
MPI_Comm intra_comm;
MPI_Intercomm_merge(child_comm,0, &intra_comm);
MPI_Barrier(child_comm);


/* tmp_size is reused for each of the three size queries below. */
int tmp_size;
MPI_Comm_size(intra_comm, &tmp_size);
printf("size of intra comm world = %d\n", tmp_size);

/* NOTE(review): child_comm is an intercommunicator; MPI_Comm_size reports its local group,
   the remote (child) group size would come from MPI_Comm_remote_size. */
MPI_Comm_size(child_comm, &tmp_size);
printf("size of child comm world = %d\n", tmp_size);

MPI_Comm_size(MPI_COMM_WORLD, &tmp_size);
printf("size of parent comm world = %d\n", tmp_size);

}

/* Outside the rank-0 block, so every rank reaches this call. */
MPI_Finalize();

The worker (child) code is:

    #include<stdio.h> 
    #include "mpi.h"
    /* Worker entry point as posted: fetches the parent intercommunicator, merges with it,
       and has rank 0 of the workers' MPI_COMM_WORLD print three communicator sizes. */
    int main( int argc, char *argv[] )
    {
    int numprocs, myrank;
    MPI_Comm parentcomm;
    MPI_Comm intra_comm;

    MPI_Init( &argc, &argv );
    MPI_Comm_size( MPI_COMM_WORLD, &numprocs );
    MPI_Comm_rank( MPI_COMM_WORLD, &myrank );

    /* Intercommunicator linking this worker to the process(es) that spawned it. */
    MPI_Comm_get_parent( &parentcomm );

    /* Merge into a single intracommunicator (this side passes 1, the parent passes 0). */
    MPI_Intercomm_merge(parentcomm, 1, &intra_comm);
    MPI_Barrier(parentcomm);

    if(myrank == 0)
    {
    int tmp_size;
    /* NOTE(review): parentcomm is an intercommunicator; MPI_Comm_size reports the local
       (worker) group, the parent group size would come from MPI_Comm_remote_size. */
    MPI_Comm_size(parentcomm, &tmp_size);
    printf("child size of parent comm world = %d\n", tmp_size);

    MPI_Comm_size(MPI_COMM_WORLD, &tmp_size);
    printf("child size of child comm world = %d\n", tmp_size);

    MPI_Comm_size(intra_comm, &tmp_size);
    printf("child size of intra comm world = %d\n", tmp_size);

    /* NOTE(review): the next closing brace ends the if(myrank == 0) block, so
       MPI_Finalize() and the return are reached on rank 0 only. */
    MPI_Finalize( );
    return 0;
  }
 } 

I run this code using

mpirun -np 12 main.c

After spawn and merge, I expect the following output:

size of intra comm world = 14
size of child comm world = 2
size of parent comm world = 12
child size of parent comm world = 12
child size of child comm world = 2
child size of intra comm world = 14

But I get the following incorrect output.

   size of intra comm world = 3
    size of child comm world = 1
    size of parent comm world = 12
    child size of parent comm world = 2
    child size of child comm world = 2
    child size of intra comm world = 3

I do not understand where the mistake is. Could someone kindly let me know where it is?

Thanks, Kris

1 Answer 1

1

Your code suffers from a few problems, which I'll try to list here:

  • In the master part, only process 0 calls MPI_Comm_spawn(). This isn't a mistake as such (especially since you use MPI_COMM_SELF as parent communicator), but it de facto excludes all other processes from the subsequent merging.
  • In both the master and worker parts, you call MPI_Comm_size() on the inter-communicator where MPI_Comm_remote_size() is needed. On an inter-communicator, MPI_Comm_size() returns the size of the local group only, not the size of the remote group.
  • In the worker code, only process 0 calls MPI_Finalize(), because the call sits inside the if(myrank == 0) block (not to mention that main() and MPI_Init() are missing from the master code).

Here are fixed versions of your code:

master.c

#include <stdio.h>
#include <mpi.h>

/*
 * Master entry point.
 *
 * Every rank of MPI_COMM_WORLD takes part in spawning the "./worker"
 * processes and in merging with them; world rank 0 then prints the size of
 * the merged communicator, of the spawned (remote) group, and of
 * MPI_COMM_WORLD. Always returns 0.
 */
int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int world_rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    /* Spawn is called by all ranks over MPI_COMM_WORLD, with rank 0 as root. */
    const int worker_count = 2;
    MPI_Comm workers;
    MPI_Comm_spawn("./worker", MPI_ARGV_NULL, worker_count, MPI_INFO_NULL,
                   0, MPI_COMM_WORLD, &workers, MPI_ERRCODES_IGNORE);

    /* Fold both sides of the intercommunicator into one intracommunicator;
       this side passes 0, the worker side passes 1. */
    MPI_Comm merged;
    MPI_Intercomm_merge(workers, 0, &merged);

    if (world_rank == 0) {
        int merged_size = 0;
        int worker_size = 0;
        int world_size = 0;

        /* The spawned group is the remote side of `workers`, hence remote_size. */
        MPI_Comm_size(merged, &merged_size);
        MPI_Comm_remote_size(workers, &worker_size);
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);

        printf("size of intra comm world = %d\n", merged_size);
        printf("size of child comm world = %d\n", worker_size);
        printf("size of parent comm world = %d\n", world_size);
    }

    MPI_Finalize();
    return 0;
}

worker.c

#include <stdio.h> 
#include <mpi.h>

int main( int argc, char *argv[] ) {

    MPI_Init( &argc, &argv );

    int myrank;
    MPI_Comm_rank( MPI_COMM_WORLD, &myrank );

    MPI_Comm parentcomm;
    MPI_Comm_get_parent( &parentcomm );

    MPI_Comm intra_comm;
    MPI_Intercomm_merge( parentcomm, 1, &intra_comm );

    if ( myrank == 0 ) {
        int tmp_size;
        MPI_Comm_remote_size( parentcomm, &tmp_size );
        printf( "child size of parent comm world = %d\n", tmp_size );

        MPI_Comm_size( MPI_COMM_WORLD, &tmp_size );
        printf( "child size of child comm world = %d\n", tmp_size );

        MPI_Comm_size( intra_comm, &tmp_size );
        printf( "child size of intra comm world = %d\n", tmp_size );
    }

    MPI_Finalize();

    return 0;
}

Which gives on my laptop:

~> mpirun -n 12 ./master
child size of parent comm world = 12
child size of child comm world = 2
child size of intra comm world = 14
size of intra comm world = 14
size of child comm world = 2
size of parent comm world = 12
Sign up to request clarification or add additional context in comments.

1 Comment

Thanks Gilles. I realized that it was an issue of remote group size;

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.