▶ 《並行程序設計導論》第六章中討論了 n 體問題,分別使用了 MPI,Pthreads,OpenMP 來進行實現,這裏是 MPI 的代碼,分爲基本算法和簡化算法(引力計算量爲基本算法的一半,可是消息傳遞較爲複雜)
● 基本算法
1 //mpi_nbody_basic.c,MPI 基本算法 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <string.h> 5 #include <math.h> 6 #include <mpi.h> 7 8 #define OUTPUT // 要求輸出結果 9 #define DEBUG // debug 模式,每一個函數給出更多的輸出 10 #define DIM 2 // 二維繫統 11 #define X 0 // X 座標 12 #define Y 1 // Y 座標 13 typedef double vect_t[DIM]; // 向量數據類型 14 const double G = 6.673e-11; // 萬有引力常量 15 int my_rank, comm_sz; // 進程編號和總進程數 16 MPI_Datatype vect_mpi_t; // 使用的派生數據類型 17 vect_t *vel = NULL; // 全局顆粒速度,用於 0 號進程的輸出 18 19 void Usage(char* prog_name)// 輸入說明 20 { 21 fprintf(stderr, "usage: mpiexec -n <nProcesses> %s\n", prog_name); 22 fprintf(stderr, "<nParticle> <nTimestep> <sizeTimestep> <outputFrequency> <g|i>\n"); 23 fprintf(stderr, " 'g': inite condition by random\n"); 24 fprintf(stderr, " 'i': inite condition from stdin\n"); 25 exit(0); 26 } 27 28 void Get_args(int argc, char* argv[], int* n_p, int* n_steps_p, double* delta_t_p, int* output_freq_p, char* g_i_p)// 獲取參數信息 29 { // 全部進程均調用該函數,由於有集合通訊,但只有 0 號進程處理參數 30 if (my_rank == 0) 31 { 32 if (argc != 6) 33 Usage(argv[0]); 34 *n_p = strtol(argv[1], NULL, 10); 35 *n_steps_p = strtol(argv[2], NULL, 10); 36 *delta_t_p = strtod(argv[3], NULL); 37 *output_freq_p = strtol(argv[4], NULL, 10); 38 *g_i_p = argv[5][0]; 39 if (*n_p <= 0 || *n_p % comm_sz || *n_steps_p < 0 || *delta_t_p <= 0 || *g_i_p != 'g' && *g_i_p != 'i')// 不合要求的輸入狀況 40 { 41 printf("haha\n"); 42 if (my_rank == 0) 43 Usage(argv[0]); 44 MPI_Finalize(); 45 exit(0); 46 } 47 } 48 MPI_Bcast(n_p, 1, MPI_INT, 0, MPI_COMM_WORLD); 49 MPI_Bcast(n_steps_p, 1, MPI_INT, 0, MPI_COMM_WORLD); 50 MPI_Bcast(delta_t_p, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); 51 MPI_Bcast(output_freq_p, 1, MPI_INT, 0, MPI_COMM_WORLD); 52 MPI_Bcast(g_i_p, 1, MPI_CHAR, 0, MPI_COMM_WORLD); 53 # ifdef DEBUG// 確認各進程中的參數狀況 54 printf("Get_args, rank%2d n %d n_steps %d delta_t %e output_freq %d g_i %c\n", 55 my_rank, *n_p, *n_steps_p, *delta_t_p, *output_freq_p, *g_i_p); 56 fflush(stdout); 57 # endif 58 } 59 60 void Gen_init_cond(double 
masses[], vect_t pos[], vect_t loc_vel[], int n, int loc_n)// 自動生成初始條件,全部進程均調用該函數,由於有集合通訊 61 { // 生成的顆粒位於原點和 X 正半軸,速度大小相等,方向平行於 Y 軸,交錯向上下 62 const double mass = 5.0e24, gap = 1.0e5, speed = 3.0e4;// 使用了地球的質量和公轉速度 63 if (my_rank == 0) 64 { 65 // srand(2);// 使用隨機方向和速度大小,下同 66 for (int i = 0; i < n; i++) 67 { 68 masses[i] = mass; 69 pos[i][X] = i * gap; 70 pos[i][Y] = 0.0; 71 vel[i][X] = 0.0; 72 // vel[i][Y] = speed * (2 * rand() / (double)RAND_MAX) - 1); 73 vel[i][Y] = (i % 2) ? -speed : speed; 74 } 75 } 76 // 同步質量,位置信息,分發速度信息 77 MPI_Bcast(masses, n, MPI_DOUBLE, 0, MPI_COMM_WORLD); 78 MPI_Bcast(pos, n, vect_mpi_t, 0, MPI_COMM_WORLD); 79 MPI_Scatter(vel, loc_n, vect_mpi_t, loc_vel, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD); 80 # ifdef DEBUG// 確認各進程中第一個顆粒的初始條件 81 printf("Gen_init_cond, rank%2d %10.3e %10.3e %10.3e %10.3e %10.3e\n", 82 my_rank, masses[0], pos[0][X], pos[0][Y], loc_vel[0][X], loc_vel[0][Y]); 83 fflush(stdout); 84 # endif 85 } 86 87 void Get_init_cond(double masses[], vect_t pos[], vect_t loc_vel[], int n, int loc_n)// 手工輸入初始條件,相似函數 Gen_init_cond() 88 { 89 if (my_rank == 0) 90 { 91 printf("For each particle, enter (in order): mass x-coord y-coord x-velocity y-velocity\n"); 92 for (int i = 0; i < n; i++) 93 { 94 scanf_s("%lf", &masses[i]); 95 scanf_s("%lf", &pos[i][X]); 96 scanf_s("%lf", &pos[i][Y]); 97 scanf_s("%lf", &vel[i][X]); 98 scanf_s("%lf", &vel[i][Y]); 99 } 100 } 101 MPI_Bcast(masses, n, MPI_DOUBLE, 0, MPI_COMM_WORLD); 102 MPI_Bcast(pos, n, vect_mpi_t, 0, MPI_COMM_WORLD); 103 MPI_Scatter(vel, loc_n, vect_mpi_t, loc_vel, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD); 104 # ifdef DEBUG 105 printf("Get_init_cond, rank%2d %10.3e %10.3e %10.3e %10.3e %10.3e\n", 106 my_rank, masses[0], pos[0][X], pos[0][Y], loc_vel[0][X], loc_vel[0][Y]); 107 fflush(stdout); 108 # endif 109 } 110 111 void Output_state(double time, double masses[], vect_t pos[], vect_t loc_vel[], int n, int loc_n)// 輸出當前狀態 112 { 113 MPI_Gather(loc_vel, loc_n, vect_mpi_t, vel, loc_n, vect_mpi_t, 0, 
MPI_COMM_WORLD);// 從各進程彙集速度信息用於輸出 114 if (my_rank == 0) 115 { 116 printf("Output_state, time = %.2f\n", time); 117 for (int i = 0; i < n; i++) 118 printf(" %2d %10.3e %10.3e %10.3e %10.3e %10.3e\n", i, masses[i], pos[i][X], pos[i][Y], vel[i][X], vel[i][Y]); 119 printf("\n"); 120 fflush(stdout); 121 } 122 } 123 124 void Compute_force(int loc_part, double masses[], vect_t loc_forces[], vect_t pos[], int n, int loc_n)// 計算顆粒 part 受到的萬有引力 125 { 126 const int part = my_rank * loc_n + loc_part;// 將局部顆粒編號轉化爲全局顆粒編號 127 int k; 128 vect_t f_part_k; 129 double len, fact; 130 for (loc_forces[loc_part][X] = loc_forces[loc_part][Y] = 0.0, k = 0; k < n; k++) 131 { 132 if (k != part) 133 { 134 f_part_k[X] = pos[part][X] - pos[k][X]; 135 f_part_k[Y] = pos[part][Y] - pos[k][Y]; 136 len = sqrt(f_part_k[X] * f_part_k[X] + f_part_k[Y] * f_part_k[Y]); 137 fact = -G * masses[part] * masses[k] / (len * len * len); 138 f_part_k[X] *= fact; 139 f_part_k[Y] *= fact; 140 loc_forces[loc_part][X] += f_part_k[X]; 141 loc_forces[loc_part][Y] += f_part_k[Y]; 142 # ifdef DEBUG// 確認計算結果 143 printf("Compute_force, rank%2d k%2d> %10.3e %10.3e %10.3e %10.3e\n", my_rank, k, len, fact, f_part_k[X], f_part_k[Y]); 144 fflush(stdout);// 函數 printf() 中引用 vel 會致使非 0 號進程中斷退出,吃了大虧 145 # endif 146 } 147 } 148 } 149 150 void Update_part(int loc_part, double masses[], vect_t loc_forces[], vect_t loc_pos[], vect_t loc_vel[], int n, int loc_n, double delta_t)// 更新顆粒位置 151 { 152 const int part = my_rank * loc_n + loc_part; 153 const double fact = delta_t / masses[part]; 154 # ifdef DEBUG// 輸出在更新前和更新後的數據 155 printf("Update_part before, part%2d %10.3e %10.3e %10.3e %10.3e %10.3e %10.3e\n", 156 part, loc_pos[loc_part][X], loc_pos[loc_part][Y], loc_vel[loc_part][X], loc_vel[loc_part][Y], loc_forces[loc_part][X], loc_forces[loc_part][Y]); 157 fflush(stdout); 158 # endif 159 loc_pos[loc_part][X] += delta_t * loc_vel[loc_part][X]; 160 loc_pos[loc_part][Y] += delta_t * loc_vel[loc_part][Y]; 161 loc_vel[loc_part][X] += fact * 
loc_forces[loc_part][X]; 162 loc_vel[loc_part][Y] += fact * loc_forces[loc_part][Y]; 163 # ifdef DEBUG 164 printf("Update_part after, part%2d %10.3e %10.3e %10.3e %10.3e\n", 165 part, loc_pos[loc_part][X], loc_pos[loc_part][Y], loc_vel[loc_part][X], loc_vel[loc_part][Y]); 166 fflush(stdout); 167 # endif 168 } 169 170 int main(int argc, char* argv[]) 171 { 172 int n, loc_n, loc_part; // 顆粒數,每進程顆粒數,當前顆粒(循環變量) 173 int n_steps, step; // 計算時間步數,當前時間片(循環變量) 174 double delta_t; // 計算時間步長 175 int output_freq; // 數據輸出頻率 176 double *masses; // 顆粒質量,每一個進程都有,一經初始化和同步就再也不改變 177 vect_t *pos, *loc_pos; // 顆粒位置,每一個時間片計算完成後須要同步 178 vect_t *loc_vel; // 顆粒速度,由各進程分開保存,不到輸出時不用同步 179 vect_t *loc_forces; // 各進程的顆粒所受引力 180 char g_i; // 初始條件選項,g 爲自動生成,i 爲手工輸入 181 double start, finish; // 計時器 182 183 MPI_Init(&argc, &argv); 184 MPI_Comm_size(MPI_COMM_WORLD, &comm_sz); 185 MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); 186 MPI_Type_contiguous(DIM, MPI_DOUBLE, &vect_mpi_t);// 提交須要的派生類型 187 MPI_Type_commit(&vect_mpi_t); 188 189 // 獲取參數,初始化數組 190 Get_args(argc, argv, &n, &n_steps, &delta_t, &output_freq, &g_i); 191 loc_n = n / comm_sz; // 要求 n % comm_sz == 0 192 masses = (double*)malloc(n * sizeof(double)); 193 pos = (vect_t*)malloc(n * sizeof(vect_t)); 194 loc_pos = pos + my_rank * loc_n; 195 loc_vel = (vect_t*)malloc(loc_n * sizeof(vect_t)); 196 loc_forces = (vect_t*)malloc(loc_n * sizeof(vect_t)); 197 if (my_rank == 0) 198 vel = (vect_t*)malloc(n * sizeof(vect_t)); 199 if (g_i == 'g') 200 Gen_init_cond(masses, pos, loc_vel, n, loc_n); 201 else 202 Get_init_cond(masses, pos, loc_vel, n, loc_n); 203 204 // 開始計算並計時 205 if (my_rank == 0) 206 start = MPI_Wtime(); 207 # ifdef OUTPUT// 輸出出初始態 208 Output_state(0.0, masses, pos, loc_vel, n, loc_n); 209 # endif 210 for (step = 1; step <= n_steps; step++) 211 { 212 // 計算每顆粒受力,更新顆粒狀態,而後同步顆粒位置 213 for (loc_part = 0; loc_part < loc_n; Compute_force(loc_part++, masses, loc_forces, pos, n, loc_n)); 214 for (loc_part = 0; loc_part < loc_n; Update_part(loc_part++, 
masses, loc_forces, loc_pos, loc_vel, n, loc_n, delta_t)); 215 MPI_Allgather(MPI_IN_PLACE, loc_n, vect_mpi_t, pos, loc_n, vect_mpi_t, MPI_COMM_WORLD); 216 # ifdef OUTPUT// 每隔一個輸出時間間隔就輸出一次 217 if (step % output_freq == 0) 218 Output_state(step * delta_t, masses, pos, loc_vel, n, loc_n); 219 # endif 220 } 221 // 報告計時,釋放資源 222 if (my_rank == 0) 223 { 224 finish = MPI_Wtime(); 225 printf("Elapsed time = %e ms\n", (finish - start) * 1000); 226 free(vel); 227 } 228 MPI_Type_free(&vect_mpi_t); 229 free(masses); 230 free(pos); 231 free(loc_vel); 232 free(loc_forces); 233 MPI_Finalize(); 234 return 0; 235 }
● 輸出結果。8 進程 16 體,3 秒,時間步長 1 秒,捨去 debug 輸出 1.264884e+00 ms;8 進程 1024 體,3600 秒,時間步長 1 秒,捨去 output 和 debug 輸出 1.328894e+04 ms
D:\Code\MPI\MPIProjectTemp\x64\Debug>mpiexec -n 8 MPIProjectTemp.exe 16 3 1 1 g Output_state, time = 0.00 0 5.000e+24 0.000e+00 0.000e+00 0.000e+00 3.000e+04 1 5.000e+24 1.000e+05 0.000e+00 0.000e+00 -3.000e+04 2 5.000e+24 2.000e+05 0.000e+00 0.000e+00 3.000e+04 3 5.000e+24 3.000e+05 0.000e+00 0.000e+00 -3.000e+04 4 5.000e+24 4.000e+05 0.000e+00 0.000e+00 3.000e+04 5 5.000e+24 5.000e+05 0.000e+00 0.000e+00 -3.000e+04 6 5.000e+24 6.000e+05 0.000e+00 0.000e+00 3.000e+04 7 5.000e+24 7.000e+05 0.000e+00 0.000e+00 -3.000e+04 8 5.000e+24 8.000e+05 0.000e+00 0.000e+00 3.000e+04 9 5.000e+24 9.000e+05 0.000e+00 0.000e+00 -3.000e+04 10 5.000e+24 1.000e+06 0.000e+00 0.000e+00 3.000e+04 11 5.000e+24 1.100e+06 0.000e+00 0.000e+00 -3.000e+04 12 5.000e+24 1.200e+06 0.000e+00 0.000e+00 3.000e+04 13 5.000e+24 1.300e+06 0.000e+00 0.000e+00 -3.000e+04 14 5.000e+24 1.400e+06 0.000e+00 0.000e+00 3.000e+04 15 5.000e+24 1.500e+06 0.000e+00 0.000e+00 -3.000e+04 Output_state, time = 1.00 0 5.000e+24 0.000e+00 3.000e+04 5.273e+04 3.000e+04 1 5.000e+24 1.000e+05 -3.000e+04 1.922e+04 -3.000e+04 2 5.000e+24 2.000e+05 3.000e+04 1.071e+04 3.000e+04 3 5.000e+24 3.000e+05 -3.000e+04 6.802e+03 -3.000e+04 4 5.000e+24 4.000e+05 3.000e+04 4.485e+03 3.000e+04 5 5.000e+24 5.000e+05 -3.000e+04 2.875e+03 -3.000e+04 6 5.000e+24 6.000e+05 3.000e+04 1.614e+03 3.000e+04 7 5.000e+24 7.000e+05 -3.000e+04 5.213e+02 -3.000e+04 8 5.000e+24 8.000e+05 3.000e+04 -5.213e+02 3.000e+04 9 5.000e+24 9.000e+05 -3.000e+04 -1.614e+03 -3.000e+04 10 5.000e+24 1.000e+06 3.000e+04 -2.875e+03 3.000e+04 11 5.000e+24 1.100e+06 -3.000e+04 -4.485e+03 -3.000e+04 12 5.000e+24 1.200e+06 3.000e+04 -6.802e+03 3.000e+04 13 5.000e+24 1.300e+06 -3.000e+04 -1.071e+04 -3.000e+04 14 5.000e+24 1.400e+06 3.000e+04 -1.922e+04 3.000e+04 15 5.000e+24 1.500e+06 -3.000e+04 -5.273e+04 -3.000e+04 Output_state, time = 2.00 0 5.000e+24 5.273e+04 6.000e+04 9.288e+04 1.641e+04 1 5.000e+24 1.192e+05 -6.000e+04 3.818e+04 -3.791e+03 2 5.000e+24 2.107e+05 
6.000e+04 2.116e+04 3.791e+03 3 5.000e+24 3.068e+05 -6.000e+04 1.356e+04 -3.101e+03 4 5.000e+24 4.045e+05 6.000e+04 8.930e+03 3.101e+03 5 5.000e+24 5.029e+05 -6.000e+04 5.739e+03 -2.959e+03 6 5.000e+24 6.016e+05 6.000e+04 3.218e+03 2.959e+03 7 5.000e+24 7.005e+05 -6.000e+04 1.043e+03 -2.929e+03 8 5.000e+24 7.995e+05 6.000e+04 -1.043e+03 2.929e+03 9 5.000e+24 8.984e+05 -6.000e+04 -3.218e+03 -2.959e+03 10 5.000e+24 9.971e+05 6.000e+04 -5.739e+03 2.959e+03 11 5.000e+24 1.096e+06 -6.000e+04 -8.930e+03 -3.101e+03 12 5.000e+24 1.193e+06 6.000e+04 -1.356e+04 3.101e+03 13 5.000e+24 1.289e+06 -6.000e+04 -2.116e+04 -3.791e+03 14 5.000e+24 1.381e+06 6.000e+04 -3.818e+04 3.791e+03 15 5.000e+24 1.447e+06 -6.000e+04 -9.288e+04 -1.641e+04 Output_state, time = 3.00 0 5.000e+24 1.456e+05 7.641e+04 1.273e+05 -1.575e+03 1 5.000e+24 1.574e+05 -6.379e+04 5.867e+04 2.528e+04 2 5.000e+24 2.319e+05 6.379e+04 2.685e+04 -2.069e+04 3 5.000e+24 3.204e+05 -6.310e+04 1.884e+04 2.228e+04 4 5.000e+24 4.134e+05 6.310e+04 1.235e+04 -2.151e+04 5 5.000e+24 5.086e+05 -6.296e+04 8.111e+03 2.179e+04 6 5.000e+24 6.048e+05 6.296e+04 4.537e+03 -2.163e+04 7 5.000e+24 7.016e+05 -6.293e+04 1.493e+03 2.169e+04 8 5.000e+24 7.984e+05 6.293e+04 -1.493e+03 -2.169e+04 9 5.000e+24 8.952e+05 -6.296e+04 -4.537e+03 2.163e+04 10 5.000e+24 9.914e+05 6.296e+04 -8.111e+03 -2.179e+04 11 5.000e+24 1.087e+06 -6.310e+04 -1.235e+04 2.151e+04 12 5.000e+24 1.180e+06 6.310e+04 -1.884e+04 -2.228e+04 13 5.000e+24 1.268e+06 -6.379e+04 -2.685e+04 2.069e+04 14 5.000e+24 1.343e+06 6.379e+04 -5.867e+04 -2.528e+04 15 5.000e+24 1.354e+06 -7.641e+04 -1.273e+05 1.575e+03 Elapsed time = 1.264884e+00 ms
● 簡化算法
1 // mpi_nbody_red.c,MPI 簡化算法,與 mppi_nbody_basic.c 相同的地方去掉了註釋 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <string.h> 5 #include <math.h> 6 #include <mpi.h> 7 8 #define OUTPUT 9 #define DEBUG 10 #define DIM 2 11 #define X 0 12 #define Y 1 13 typedef double vect_t[DIM]; 14 const double G = 6.673e-11; 15 int my_rank, comm_sz; 16 MPI_Datatype vect_mpi_t; 17 MPI_Datatype cyclic_mpi_t; // 環形傳輸的數據類型 18 vect_t *vel = NULL; 19 vect_t *pos = NULL; // 位置信息變成了全局變量 20 21 void Usage(char* prog_name) 22 { 23 fprintf(stderr, "usage: mpiexec -n <nProcesses> %s\n", prog_name); 24 fprintf(stderr, "<nParticle> <nTimestep> <sizeTimestep> <outputFrequency> <g|i>\n"); 25 fprintf(stderr, " 'g': inite condition by random\n"); 26 fprintf(stderr, " 'i': inite condition from stdin\n"); 27 exit(0); 28 } 29 30 void Get_args(int argc, char* argv[], int* n_p, int* n_steps_p, double* delta_t_p, int* output_freq_p, char* g_i_p) 31 { 32 if (my_rank == 0) 33 { 34 if (argc != 6) 35 Usage(argv[0]); 36 *n_p = strtol(argv[1], NULL, 10); 37 *n_steps_p = strtol(argv[2], NULL, 10); 38 *delta_t_p = strtod(argv[3], NULL); 39 *output_freq_p = strtol(argv[4], NULL, 10); 40 *g_i_p = argv[5][0]; 41 if (*n_p <= 0 || *n_p % comm_sz || *n_steps_p < 0 || *delta_t_p <= 0 || *g_i_p != 'g' && *g_i_p != 'i') 42 { 43 printf("haha\n"); 44 if (my_rank == 0) 45 Usage(argv[0]); 46 MPI_Finalize(); 47 exit(0); 48 } 49 } 50 MPI_Bcast(n_p, 1, MPI_INT, 0, MPI_COMM_WORLD); 51 MPI_Bcast(n_steps_p, 1, MPI_INT, 0, MPI_COMM_WORLD); 52 MPI_Bcast(delta_t_p, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); 53 MPI_Bcast(output_freq_p, 1, MPI_INT, 0, MPI_COMM_WORLD); 54 MPI_Bcast(g_i_p, 1, MPI_CHAR, 0, MPI_COMM_WORLD); 55 # ifdef DEBUG 56 printf("Get_args, rank%2d n %d n_steps %d delta_t %e output_freq %d g_i %c\n", 57 my_rank, *n_p, *n_steps_p, *delta_t_p, *output_freq_p, *g_i_p); 58 fflush(stdout); 59 # endif 60 } 61 62 void Build_cyclic_mpi_type(int loc_n)// 生成大小爲 loc_n 的循環分配數據類型 63 { 64 MPI_Datatype temp_mpi_t; 65 MPI_Aint lb, extent; 66 
MPI_Type_vector(loc_n, 1, comm_sz, vect_mpi_t, &temp_mpi_t);// 將跨度爲 comm_sz 的 loc_n 個 「連續 2 個 double」 封裝爲 temp_mpi_t 67 MPI_Type_get_extent(vect_mpi_t, &lb, &extent); 68 MPI_Type_create_resized(temp_mpi_t, lb, extent, &cyclic_mpi_t); 69 MPI_Type_commit(&cyclic_mpi_t); 70 } 71 72 void Gen_init_cond(double masses[], vect_t loc_pos[], vect_t loc_vel[], int n, int loc_n) 73 { 74 const double mass = 5.0e24, gap = 1.0e5, speed = 3.0e4; 75 if (my_rank == 0) 76 { 77 // srand(2); 78 for (int i = 0; i < n; i++) 79 { 80 masses[i] = mass; 81 pos[i][X] = i * gap; 82 pos[i][Y] = 0.0; 83 vel[i][X] = 0.0; 84 // vel[i][Y] = speed * (2 * rand() / (double)RAND_MAX) - 1); 85 vel[i][Y] = (i % 2) ? -speed : speed; 86 } 87 } 88 MPI_Bcast(masses, n, MPI_DOUBLE, 0, MPI_COMM_WORLD); 89 MPI_Scatter(pos, 1, cyclic_mpi_t, loc_pos, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD);// loc_pos 和 loc_vel 須要分別分發 90 MPI_Scatter(vel, 1, cyclic_mpi_t, loc_vel, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD); 91 # ifdef DEBUG 92 printf("Gen_init_cond, rank%2d %10.3e %10.3e %10.3e %10.3e %10.3e\n", 93 my_rank, masses[0], loc_pos[0][X], loc_pos[0][Y], loc_vel[0][X], loc_vel[0][Y]); 94 fflush(stdout); 95 # endif 96 } 97 98 void Get_init_cond(double masses[], vect_t loc_pos[], vect_t loc_vel[], int n, int loc_n) 99 { 100 if (my_rank == 0) 101 { 102 printf("For each particle, enter (in order): mass x-coord y-coord x-velocity y-velocity\n"); 103 for (int i = 0; i < n; i++) 104 { 105 scanf_s("%lf", &masses[i]); 106 scanf_s("%lf", &pos[i][X]); 107 scanf_s("%lf", &pos[i][Y]); 108 scanf_s("%lf", &vel[i][X]); 109 scanf_s("%lf", &vel[i][Y]); 110 } 111 } 112 MPI_Bcast(masses, n, MPI_DOUBLE, 0, MPI_COMM_WORLD); 113 MPI_Scatter(pos, 1, cyclic_mpi_t, loc_pos, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD);// loc_pos 和 loc_vel 須要分別分發 114 MPI_Scatter(vel, 1, cyclic_mpi_t, loc_vel, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD); 115 # ifdef DEBUG 116 printf("Get_init_cond, rank%2d %10.3e %10.3e %10.3e %10.3e %10.3e\n", 117 my_rank, masses[0], loc_pos[0][X], 
loc_pos[0][Y], loc_vel[0][X], loc_vel[0][Y]); 118 fflush(stdout); 119 # endif 120 } 121 122 void Output_state(double time, double masses[], vect_t loc_pos[], vect_t loc_vel[], int n, int loc_n) 123 { 124 MPI_Gather(loc_pos, loc_n, vect_mpi_t, pos, 1, cyclic_mpi_t, 0, MPI_COMM_WORLD);// loc_pos 和 loc_vel 須要分別彙集 125 MPI_Gather(loc_vel, loc_n, vect_mpi_t, vel, 1, cyclic_mpi_t, 0, MPI_COMM_WORLD); 126 if (my_rank == 0) 127 { 128 printf("Output_state, time = %.2f\n", time); 129 for (int i = 0; i < n; i++) 130 printf(" %2d %10.3e %10.3e %10.3e %10.3e %10.3e\n", i, masses[i], pos[i][X], pos[i][Y], vel[i][X], vel[i][Y]); 131 printf("\n"); 132 fflush(stdout); 133 } 134 } 135 136 int First_index(int gbl1, int rk1, int rk2, int proc_count) 137 { 138 return gbl1 + (rk2 - rk1) + (rk1 < rk2 ? 0 : proc_count); 139 } 140 141 int Local_to_global(int loc_part, int proc_rk, int proc_count)// 進程局部編號轉全局編號 142 { 143 return loc_part * proc_count + proc_rk; 144 } 145 146 int Global_to_local(int gbl_part, int proc_rk, int proc_count)// 全局編號轉進程局部編號 147 { 148 return (gbl_part - proc_rk) / proc_count; 149 } 150 151 void Compute_force_pair(double m1, double m2, vect_t pos1, vect_t pos2, vect_t force1, vect_t force2)// 計算兩個顆粒之間的引力 152 { 153 const double mg = -G * m1 * m2; 154 vect_t f_part_k; 155 double len, fact; 156 157 f_part_k[X] = pos1[X] - pos2[X]; 158 f_part_k[Y] = pos1[Y] - pos2[Y]; 159 len = sqrt(f_part_k[X] * f_part_k[X] + f_part_k[Y] * f_part_k[Y]); 160 fact = mg / (len * len * len); 161 f_part_k[X] *= fact; 162 f_part_k[Y] *= fact; 163 force1[X] += f_part_k[X]; 164 force1[Y] += f_part_k[Y]; 165 force2[X] -= f_part_k[X]; 166 force2[Y] -= f_part_k[Y]; 167 } 168 169 void Compute_proc_forces(double masses[], vect_t tmp_data[], vect_t loc_forces[], vect_t pos1[], int loc_n1, int rk1, int loc_n2, int rk2, int n, int p) 170 { // 計算 pos1 與 tmp_data 中知足下標條件的顆粒之間的引力 171 //masses 顆粒質量表 172 //tmp_data 環形傳輸數據 173 //loc_forces 本進程顆粒受力數據 174 //pos1 本進程顆粒位置數據 175 //loc_n1 pos1 中顆粒數量 176 //rk1 pos1 
中 process owning particles 177 //loc_n2 temp_data 中顆粒數量 178 int gbl_part1, gbl_part2, loc_part1, loc_part2; //rk2 tmp_data 中 process owning contributed particles 179 for (gbl_part1 = rk1, loc_part1 = 0; loc_part1 < loc_n1; loc_part1++, gbl_part1 += p) //n 總顆粒數 180 { //p 參與計算的進程數 181 for (gbl_part2 = First_index(gbl_part1, rk1, rk2, p), loc_part2 = Global_to_local(gbl_part2, rk2, p); loc_part2 < loc_n2; loc_part2++, gbl_part2 += p) 182 { 183 # ifdef DEBUG 184 printf("Compute_proc_forces before, rank%2d> part%2d %10.3e %10.3e part%2d %10.3e %10.3e\n", 185 my_rank, gbl_part1, loc_forces[loc_part1][X], loc_forces[loc_part1][Y], gbl_part2, tmp_data[loc_n2 + loc_part2][X], tmp_data[loc_n2 + loc_part2][Y]); 186 # endif 187 Compute_force_pair(masses[gbl_part1], masses[gbl_part2], pos1[loc_part1], tmp_data[loc_part2], loc_forces[loc_part1], tmp_data[loc_n2 + loc_part2]); 188 # ifdef DEBUG 189 printf("Compute_proc_forces before, rank%2d> part%2d %10.3e %10.3e part%2d %10.3e %10.3e\n", 190 my_rank, gbl_part1, loc_forces[loc_part1][X], loc_forces[loc_part1][Y], gbl_part2, tmp_data[loc_n2 + loc_part2][X], tmp_data[loc_n2 + loc_part2][Y]); 191 # endif 192 } 193 } 194 } 195 196 void Compute_forces(double masses[], vect_t tmp_data[], vect_t loc_forces[], vect_t loc_pos[], int n, int loc_n)// 計算本線程各顆粒受到的引力 197 { // masses 顆粒質量表 198 const int src = (my_rank + 1) % comm_sz, dest = (my_rank - 1 + comm_sz) % comm_sz; // tmp_data 環形傳輸數據 199 int i, other_proc, loc_part; // loc_forces 本進程顆粒受力數據 200 // loc_pos 本進程顆粒位置數據 201 memcpy(tmp_data, loc_pos, loc_n * sizeof(vect_t)); // 將本進程分到的位置數據放入 tmp_data // n 總顆粒數 202 memset(tmp_data + loc_n, 0, loc_n * sizeof(vect_t));// 初始化 tmp_data 的引力數據 // loc_n 本進程分到的顆粒數 203 memset(loc_forces, 0, loc_n * sizeof(vect_t)); // 初始化本進程引力數據 204 205 Compute_proc_forces(masses, tmp_data, loc_forces, loc_pos, loc_n, my_rank, loc_n, my_rank, n, comm_sz);// 計算本進程的顆粒間的引力做用 206 for (i = 1; i < comm_sz; i++)// 計算本進程的顆粒與收到的新顆粒之間的引力做用 207 { 208 other_proc = (my_rank + i) 
% comm_sz;// 每次交換信息的對象不一樣 209 MPI_Sendrecv_replace(tmp_data, 2 * loc_n, vect_mpi_t, dest, 0, src, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); 210 Compute_proc_forces(masses, tmp_data, loc_forces, loc_pos, loc_n, my_rank, loc_n, other_proc, n, comm_sz); 211 } 212 MPI_Sendrecv_replace(tmp_data, 2 * loc_n, vect_mpi_t, dest, 0, src, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); 213 for (loc_part = 0; loc_part < loc_n; loc_part++) 214 { 215 loc_forces[loc_part][X] += tmp_data[loc_n + loc_part][X]; 216 loc_forces[loc_part][Y] += tmp_data[loc_n + loc_part][Y]; 217 } 218 } 219 220 void Update_part(int loc_part, double masses[], vect_t loc_forces[], vect_t loc_pos[], vect_t loc_vel[], int n, int loc_n, double delta_t) 221 { 222 const int part = my_rank * loc_n + loc_part; 223 const double fact = delta_t / masses[part]; 224 # ifdef DEBUG// 輸出在更新前和更新後的數據 225 printf("Update_part before, part%2d %10.3e %10.3e %10.3e %10.3e %10.3e %10.3e\n", 226 part, loc_pos[loc_part][X], loc_pos[loc_part][Y], loc_vel[loc_part][X], loc_vel[loc_part][Y], loc_forces[loc_part][X], loc_forces[loc_part][Y]); 227 fflush(stdout); 228 # endif 229 loc_pos[loc_part][X] += delta_t * loc_vel[loc_part][X]; 230 loc_pos[loc_part][Y] += delta_t * loc_vel[loc_part][Y]; 231 loc_vel[loc_part][X] += fact * loc_forces[loc_part][X]; 232 loc_vel[loc_part][Y] += fact * loc_forces[loc_part][Y]; 233 # ifdef DEBUG 234 printf("Update_part after, part%2d %10.3e %10.3e %10.3e %10.3e\n", 235 part, loc_pos[loc_part][X], loc_pos[loc_part][Y], loc_vel[loc_part][X], loc_vel[loc_part][Y]); 236 fflush(stdout); 237 # endif 238 } 239 240 241 int main(int argc, char* argv[]) 242 { 243 int n, loc_n, loc_part; 244 int n_steps, step; 245 double delta_t; 246 int output_freq; 247 double *masses; 248 vect_t* loc_pos; // *pos 變成了全局變量 249 vect_t* tmp_data; // 用於環形交換的數據 250 vect_t* loc_vel; 251 vect_t* loc_forces; 252 char g_i; 253 double start, finish; 254 255 MPI_Init(&argc, &argv); 256 MPI_Comm_size(MPI_COMM_WORLD, &comm_sz); 257 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); 258 MPI_Type_contiguous(DIM, MPI_DOUBLE, &vect_mpi_t); 259 MPI_Type_commit(&vect_mpi_t); 260 261 Get_args(argc, argv, &n, &n_steps, &delta_t, &output_freq, &g_i); 262 loc_n = n / comm_sz; 263 Build_cyclic_mpi_type(loc_n); 264 masses = (double*)malloc(n * sizeof(double)); 265 tmp_data = (vect_t*)malloc(2 * loc_n * sizeof(vect_t)); // 前半段 tmp_pos 後半段 temp_force 266 loc_pos = (vect_t*)malloc(loc_n * sizeof(vect_t)); 267 loc_vel = (vect_t*)malloc(loc_n * sizeof(vect_t)); 268 loc_forces = (vect_t*)malloc(loc_n * sizeof(vect_t)); 269 if (my_rank == 0) 270 { 271 pos = (vect_t*)malloc(n * sizeof(vect_t)); 272 vel = (vect_t*)malloc(n * sizeof(vect_t)); 273 } 274 if (g_i == 'g') 275 Gen_init_cond(masses, loc_pos, loc_vel, n, loc_n); 276 else 277 Get_init_cond(masses, loc_pos, loc_vel, n, loc_n); 278 279 if(my_rank == 0) 280 start = MPI_Wtime(); 281 # ifdef OUTPUT 282 Output_state(0.0, masses, loc_pos, loc_vel, n, loc_n); 283 # endif 284 for (step = 1; step <= n_steps; step++) 285 { 286 Compute_forces(masses, tmp_data, loc_forces, loc_pos, n, loc_n); 287 for (loc_part = 0; loc_part < loc_n; Update_part(loc_part++, masses, loc_forces, loc_pos, loc_vel, n, loc_n, delta_t)); 288 # ifdef OUTPUT 289 if (step % output_freq == 0) 290 Output_state(step * delta_t, masses, loc_pos, loc_vel, n, loc_n); 291 # endif 292 } 293 294 if (my_rank == 0) 295 { 296 finish = MPI_Wtime(); 297 printf("Elapsed time = %e ms\n", (finish - start) * 1000); 298 free(pos); 299 free(vel); 300 } 301 MPI_Type_free(&vect_mpi_t); 302 MPI_Type_free(&cyclic_mpi_t); 303 free(masses); 304 free(tmp_data); 305 free(loc_forces); 306 free(loc_pos); 307 free(loc_vel); 308 MPI_Finalize(); 309 return 0; 310 }
● 輸出結果,與基本算法相似。8 進程 16 體,3 秒,時間步長 1 秒,捨去 debug 輸出 1.476754e+00 ms;8 進程 1024 體,3600 秒,時間步長 1 秒,捨去 output 和 debug 輸出 1.329172e+04 ms,在較大數據規模上稍有優點