OpenMP - Parallel code gives a different result from the sequential one
I am having some trouble with OpenMP. I wrote some computational code and parallelized it with OpenMP, but the sequential and parallel runs give me different results.
Here is the code:
for (i = 0; i < grid_number; i++)
{
    double norm = 0;
    const double alpha = gsl_vector_get(valpha, i);
    for (j = 0; j < n_sim; j++)
    {
        gsl_matrix_complex *sub_data = gsl_matrix_complex_calloc(n_obs, 1);
        struct cmatrix cm;
        cm.H0 = gsl_matrix_complex_calloc(n_obs - 1, nWeight);
        cm.optMatrix = gsl_matrix_complex_calloc(n_obs - 1, n_obs - 1);
        for (k = 0; k < 3; k++)
        {
            gsl_vector_set(sub_b02, k, gsl_matrix_get(b02, j, k));
        }
        for (k = 0; k < n_obs; k++)
        {
            const gsl_complex z = gsl_complex_rect(gsl_matrix_get(data2, k, j), 0);
            gsl_matrix_complex_set(sub_data, k, 0, z);
        }
        gsl_vector *theta = gsl_vector_calloc(3);
        c_matrix(sub_b02, sub_data, 1, cm, alpha);
        fminsearch(sub_b02, sub_data, cm.optMatrix, cm.H0, theta);
        gsl_vector_sub(theta, theta1);
        norm += gsl_blas_dnrm2(theta);
        gsl_matrix_free(sub_data);
        gsl_matrix_free(cm.H0);
        gsl_matrix_free(cm.optMatrix);
        gsl_vector_free(theta);
    }
    double mse = total_weight * norm / (double)n_sim;
    printf("alpha:%f, MSE:%.12e\n", alpha, mse);
    mses[i] = mse;
    alphas[i] = alpha;
}
Running this code gives this result:
alpha:0.000010, MSE:1.368646778831e-01
alpha:0.000076, MSE:1.368646778831e-01
alpha:0.000142, MSE:1.368646778831e-01
alpha:0.000208, MSE:1.368646778831e-01
alpha:0.000274, MSE:1.368646778831e-01
alpha:0.000340, MSE:1.368646778831e-01
alpha:0.000406, MSE:1.368646778831e-01
alpha:0.000472, MSE:1.368646778831e-01
alpha:0.000538, MSE:1.368646778831e-01
alpha:0.000604, MSE:1.368646778831e-01
alpha:0.000670, MSE:1.368646778831e-01
alpha:0.000736, MSE:1.368646778831e-01
alpha:0.000802, MSE:1.368646778831e-01
alpha:0.000868, MSE:1.368646778831e-01
alpha:0.000934, MSE:1.368646778831e-01
Then I tried to parallelize the code with OpenMP:
#pragma omp parallel for private(j, k)
for (i = 0; i < grid_number; i++)
{
    double norm = 0;
    const double alpha = gsl_vector_get(valpha, i);
    for (j = 0; j < n_sim; j++)
    {
        gsl_matrix_complex *sub_data = gsl_matrix_complex_calloc(n_obs, 1);
        struct cmatrix cm;
        cm.H0 = gsl_matrix_complex_calloc(n_obs - 1, nWeight);
        cm.optMatrix = gsl_matrix_complex_calloc(n_obs - 1, n_obs - 1);
        for (k = 0; k < 3; k++)
        {
            gsl_vector_set(sub_b02, k, gsl_matrix_get(b02, j, k));
        }
        for (k = 0; k < n_obs; k++)
        {
            const gsl_complex z = gsl_complex_rect(gsl_matrix_get(data2, k, j), 0);
            gsl_matrix_complex_set(sub_data, k, 0, z);
        }
        gsl_vector *theta = gsl_vector_calloc(3);
        c_matrix(sub_b02, sub_data, 1, cm, alpha);
        fminsearch(sub_b02, sub_data, cm.optMatrix, cm.H0, theta);
        gsl_vector_sub(theta, theta1);
        norm += gsl_blas_dnrm2(theta);
        gsl_matrix_free(sub_data);
        gsl_matrix_free(cm.H0);
        gsl_matrix_free(cm.optMatrix);
        gsl_vector_free(theta);
    }
    double mse = total_weight * norm / (double)n_sim;
    printf("alpha:%f, MSE:%.12e\n", alpha, mse);
    mses[i] = mse;
    alphas[i] = alpha;
}
Parallel result:
alpha:0.000934, MSE:1.368646778831e-01
alpha:0.000802, MSE:1.368646778831e-01
alpha:0.000274, MSE:1.368646778831e-01
alpha:0.000670, MSE:1.368646778831e-01
alpha:0.000010, MSE:1.368646778831e-01
alpha:0.000538, MSE:1.368646778831e-01
alpha:0.000406, MSE:1.368646778831e-01
alpha:0.000142, MSE:1.368646778831e-01
alpha:0.000736, MSE:1.368646778831e-01
alpha:0.000604, MSE:1.368646778831e-01
alpha:0.000208, MSE:1.368388509959e-01
alpha:0.000340, MSE:1.368646778831e-01
alpha:0.000472, MSE:1.369194416804e-01
alpha:0.000868, MSE:1.368691005950e-01
alpha:0.000076, MSE:1.369461873652e-01
Why do the two results differ for some values of alpha?
Different results between the sequential and the parallel version of a program practically always mean one thing: a race condition. In your case it is hard to pinpoint, since you did not provide a minimal working example (shame on you).
But I have managed to reverse-engineer some of the missing pieces, and I would claim that your problem lies in the variable sub_b02. It is defined outside the parallel block, which makes it shared by default, yet you call gsl_vector_set on it, so different threads write to the same memory location. Since it is a pointer, you probably have to allocate it inside the parallel block.
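A minimal sketch of that fix, assuming sub_b02 is a 3-element gsl_vector (its original allocation is not shown in the question). Moving the declaration into the loop body makes it automatically private to each thread:
#pragma omp parallel for private(j, k)
for (i = 0; i < grid_number; i++)
{
    double norm = 0;
    const double alpha = gsl_vector_get(valpha, i);
    /* Declared inside the loop body, so every thread allocates and fills
       its own vector instead of racing on one shared sub_b02. */
    gsl_vector *sub_b02 = gsl_vector_calloc(3);
    for (j = 0; j < n_sim; j++)
    {
        /* ... inner loop body exactly as before ... */
    }
    gsl_vector_free(sub_b02);
    /* ... MSE computation as before ... */
}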
I cannot say there are no further errors, especially since I cannot see c_matrix and fminsearch. But what you should do is take a moment to think about which variables ought to be shared or private to the threads, then add default(none) to the pragma and spell out the shared/private clauses explicitly. That should give you a much better picture of what you have missed.
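As a rough illustration (the exact lists depend on declarations the question does not show, so treat every name here as a guess):
#pragma omp parallel for default(none) private(j, k) \
        shared(grid_number, n_sim, n_obs, nWeight, valpha, b02, data2, \
               theta1, total_weight, mses, alphas)
for (i = 0; i < grid_number; i++)
{
    /* ... loop body unchanged ... */
}
With default(none) the compiler refuses to build until every variable used inside the region is accounted for, so a forgotten shared pointer like sub_b02 shows up as a compile error instead of a silent race.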