多线程执行时间与随机数之和
Multithreading execution time with sum of random numbers
我正在尝试创建一个多线程程序,该程序将 N 个随机数 [-100,100] 的数组与 K 个工作线程相加,这些工作线程由程序员实现的自旋锁(忙等待)序列化。在尝试使用随机数之前,出于测试目的,我将整个数组初始化为 1,正如您将在我的代码中看到的那样。由于我对问题出在哪里一无所知,我将 post 完整代码:
#include <iostream>
#include <string.h>
#include <pthread.h>
#include <cstdlib>
#include <time.h>
#include <atomic>
#include <chrono>
using namespace std;
using namespace chrono;
struct lock {
long double sum = 0;
atomic_flag m_flag = ATOMIC_FLAG_INIT; // Inicializa com m_flag = 0
void acquire() {
while(m_flag.test_and_set());
}
void release() {
m_flag.clear();
}
};
struct t_data{
int t_id;
char* sumArray;
struct lock* spinlock;
};
void* sum(void* thread_data) {
struct t_data *my_data;
long double m_sum=0;
my_data = (struct t_data *) thread_data;
for (int i=0;i<strlen(my_data->sumArray);i++) {
m_sum += my_data->sumArray[i];
}
my_data->spinlock->acquire();
cout << "THREAD ID: " << my_data->t_id << endl;
cout << "Acquired lock." << endl;
my_data->spinlock->sum += m_sum;
cout << "Releasing lock..." << endl << endl;
my_data->spinlock->release();
}
int main(int argc, char** argv) {
// Inicializar cronômetro, arrays, spinlock,etc. , spinlock, etc.
system_clock::time_point starting_time = system_clock::now();
int K = atoi(argv[1]);
int N = atoi(argv[2]);
int temp;
double expected_sum = 0;
pthread_t threads[K];
struct t_data threads_data[K];
struct lock spinlock;
const long int numElements = (long int) N/K; //Divisão inteira de N/K para dividir array em parcelas
// Criar array[K] de arrays para delegar cada sub-lista a uma thread
char** numArrays = new char*[K];
for(int i=0;i<K;i++)
numArrays[i] = new char[numElements]; //Char utilizado para que seja alocado apenas 1 byte por número
// Inicializar seed aleatória para preenchimento de arrays
srand(time(NULL));
//Preencher arrays que serão passados às threads criadas
for (int i=0;i<K;i++) {
for(int j=0;j<numElements;j++) {
temp = 1;//rand() % 201 - 100; (CHANGING THIS GIVES UNEXPECTED RESULTS)
numArrays[i][j] = temp;
expected_sum+=temp;
}
//Criar threads e passando argumentos(id,spinlock,array)
threads_data[i].t_id = i;
threads_data[i].spinlock = &spinlock;
threads_data[i].sumArray = numArrays[i];
pthread_create(&threads[i],NULL,sum,(void*)&threads_data[i]);
}
// Parar o programa até que todas as threads terminem para imprimir soma correta
for (int i=0;i<K;i++){
if(pthread_join(threads[i],NULL)) cout << "Error waiting for threads." << endl;
}
// Somando últimos valores restantes no caso de N%K != 0 (esta parcela torna-se irrelevante à medida que N >> K)
for(int i=0;i<(int)N%K;i++) {
temp = 1;//rand() % 201 - 100; (CHANGING THIS GIVES UNEXPECTED RESULTS)
spinlock.sum+=temp;
expected_sum+=temp;
}
// Printar resultado esperado, o calculado e tempo de execução
cout << "EXPECTED SUM = " << expected_sum << endl;
cout << "CALCULATED SUM = " << spinlock.sum << endl;
// Liberar memória alocada
for(int i=0;i<K;i++)
delete[] numArrays[i];
delete[] numArrays;
auto start_ms = time_point_cast<milliseconds>(starting_time);
auto now = system_clock::now();
auto now_ms = time_point_cast<milliseconds>(now);
auto value = now_ms - start_ms;
long execution_time = value.count();
cout << "-----------------------" << endl;
cout << "Execution time: " << execution_time << "ms" << endl;
return 0;
}
这在计算总和时效果很好,但对执行时间造成了问题:它应该与 (N/K) 线性缩放,但测试 K=10,N=10⁶:
EXPECTED SUM = 1e+06
CALCULATED SUM = 1e+06
-----------------------
Execution time: 1310ms
K=10, N=2*10⁶:
EXPECTED SUM = 2e+06
CALCULATED SUM = 2e+06
-----------------------
Execution time: 7144ms
我不知道为什么会这样。应该加倍。更改 K 正常工作。另外,如果我使用 rand() % 201-100
而不是 1,事情就会变得一团糟。对于 K=10,N=10⁶:
EXPECTED SUM = -16307
CALCULATED SUM = 1695
-----------------------
Execution time: 95ms
关于执行时间的变化,N 是固定的(线性缩放)但 K 不再有任何区别。 None 这些对我来说很有意义。
提前致谢!
strlen(my_data->sumArray)
将在字符数组/c 字符串中的第一个 0
处停止,同时您继续对 expected_sum
的 temp
值求和。对非 ascii 数据使用 vector
(毕竟这是 C++):
// use a vector in t_data
struct t_data{
int t_id;
std::vector<char> sumArray;
lock* spinlock;
};
// adjust summing up in sum(void* thread_data)
for (char value : my_data->sumArray) {
m_sum += value;
}
// initialise like this
threads_data[i].sumArray.resize(numElements);
for(size_t j = 0; j < threads_data[i].sumArray.size(); ++j) {
char temp = 1; //or (char)(rand() % 201 - 100);
threads_data[i].sumArray[j] = temp;
expected_sum += temp;
}
现在考虑一下您的计时:将 threads_data[i]
和 expected_sum
的初始化移到计时区域之外,否则数以百万计的 rand
调用肯定会主宰一切。在任何情况下,您都在测量顺序版本和并行版本,因此您不能指望 K
会在时间上有所不同:您总是至少测量顺序版本 + 最后一个并行版本(加入时)。
我正在尝试创建一个多线程程序,该程序将 N 个随机数 [-100,100] 的数组与 K 个工作线程相加,这些工作线程由程序员实现的自旋锁(忙等待)序列化。在尝试使用随机数之前,出于测试目的,我将整个数组初始化为 1,正如您将在我的代码中看到的那样。由于我对问题出在哪里一无所知,我将 post 完整代码:
#include <iostream>
#include <string.h>
#include <pthread.h>
#include <cstdlib>
#include <time.h>
#include <atomic>
#include <chrono>
using namespace std;
using namespace chrono;
struct lock {
long double sum = 0;
atomic_flag m_flag = ATOMIC_FLAG_INIT; // Inicializa com m_flag = 0
void acquire() {
while(m_flag.test_and_set());
}
void release() {
m_flag.clear();
}
};
struct t_data{
int t_id;
char* sumArray;
struct lock* spinlock;
};
void* sum(void* thread_data) {
struct t_data *my_data;
long double m_sum=0;
my_data = (struct t_data *) thread_data;
for (int i=0;i<strlen(my_data->sumArray);i++) {
m_sum += my_data->sumArray[i];
}
my_data->spinlock->acquire();
cout << "THREAD ID: " << my_data->t_id << endl;
cout << "Acquired lock." << endl;
my_data->spinlock->sum += m_sum;
cout << "Releasing lock..." << endl << endl;
my_data->spinlock->release();
}
int main(int argc, char** argv) {
// Inicializar cronômetro, arrays, spinlock,etc. , spinlock, etc.
system_clock::time_point starting_time = system_clock::now();
int K = atoi(argv[1]);
int N = atoi(argv[2]);
int temp;
double expected_sum = 0;
pthread_t threads[K];
struct t_data threads_data[K];
struct lock spinlock;
const long int numElements = (long int) N/K; //Divisão inteira de N/K para dividir array em parcelas
// Criar array[K] de arrays para delegar cada sub-lista a uma thread
char** numArrays = new char*[K];
for(int i=0;i<K;i++)
numArrays[i] = new char[numElements]; //Char utilizado para que seja alocado apenas 1 byte por número
// Inicializar seed aleatória para preenchimento de arrays
srand(time(NULL));
//Preencher arrays que serão passados às threads criadas
for (int i=0;i<K;i++) {
for(int j=0;j<numElements;j++) {
temp = 1;//rand() % 201 - 100; (CHANGING THIS GIVES UNEXPECTED RESULTS)
numArrays[i][j] = temp;
expected_sum+=temp;
}
//Criar threads e passando argumentos(id,spinlock,array)
threads_data[i].t_id = i;
threads_data[i].spinlock = &spinlock;
threads_data[i].sumArray = numArrays[i];
pthread_create(&threads[i],NULL,sum,(void*)&threads_data[i]);
}
// Parar o programa até que todas as threads terminem para imprimir soma correta
for (int i=0;i<K;i++){
if(pthread_join(threads[i],NULL)) cout << "Error waiting for threads." << endl;
}
// Somando últimos valores restantes no caso de N%K != 0 (esta parcela torna-se irrelevante à medida que N >> K)
for(int i=0;i<(int)N%K;i++) {
temp = 1;//rand() % 201 - 100; (CHANGING THIS GIVES UNEXPECTED RESULTS)
spinlock.sum+=temp;
expected_sum+=temp;
}
// Printar resultado esperado, o calculado e tempo de execução
cout << "EXPECTED SUM = " << expected_sum << endl;
cout << "CALCULATED SUM = " << spinlock.sum << endl;
// Liberar memória alocada
for(int i=0;i<K;i++)
delete[] numArrays[i];
delete[] numArrays;
auto start_ms = time_point_cast<milliseconds>(starting_time);
auto now = system_clock::now();
auto now_ms = time_point_cast<milliseconds>(now);
auto value = now_ms - start_ms;
long execution_time = value.count();
cout << "-----------------------" << endl;
cout << "Execution time: " << execution_time << "ms" << endl;
return 0;
}
这在计算总和时效果很好,但对执行时间造成了问题:它应该与 (N/K) 线性缩放,但测试 K=10,N=10⁶:
EXPECTED SUM = 1e+06
CALCULATED SUM = 1e+06
-----------------------
Execution time: 1310ms
K=10, N=2*10⁶:
EXPECTED SUM = 2e+06
CALCULATED SUM = 2e+06
-----------------------
Execution time: 7144ms
我不知道为什么会这样。应该加倍。更改 K 正常工作。另外,如果我使用 rand() % 201-100
而不是 1,事情就会变得一团糟。对于 K=10,N=10⁶:
EXPECTED SUM = -16307
CALCULATED SUM = 1695
-----------------------
Execution time: 95ms
关于执行时间的变化,N 是固定的(线性缩放)但 K 不再有任何区别。 None 这些对我来说很有意义。
提前致谢!
strlen(my_data->sumArray)
将在字符数组/c 字符串中的第一个 0
处停止,同时您继续对 expected_sum
的 temp
值求和。对非 ascii 数据使用 vector
(毕竟这是 C++):
// use a vector in t_data
struct t_data{
int t_id;
std::vector<char> sumArray;
lock* spinlock;
};
// adjust summing up in sum(void* thread_data)
for (char value : my_data->sumArray) {
m_sum += value;
}
// initialise like this
threads_data[i].sumArray.resize(numElements);
for(size_t j = 0; j < threads_data[i].sumArray.size(); ++j) {
char temp = 1; //or (char)(rand() % 201 - 100);
threads_data[i].sumArray[j] = temp;
expected_sum += temp;
}
现在考虑一下您的计时:将 threads_data[i]
和 expected_sum
的初始化移到计时区域之外,否则数以百万计的 rand
调用肯定会主宰一切。在任何情况下,您都在测量顺序版本和并行版本,因此您不能指望 K
会在时间上有所不同:您总是至少测量顺序版本 + 最后一个并行版本(加入时)。