选择锁粒度

Question

我正在测试几个基本互斥实现的性能，以尝试了解哪种解决方案最有效。操作很简单，5个整数的向量不断被4个线程覆盖，一个加法，一个减法，另外两个做同样的事情，但顺序相反。

虽然我在解释结果时遇到了一些问题：

串行单线程执行整个工作负载：3.3 秒

2.1 秒：为向量的每个槽位各使用一个锁（共 5 个锁）

1.6 秒：只使用一个大锁来保护对向量的写访问

0.33 秒：不使用任何锁（这自然会产生错误的结果）

单独的锁不应该比只使用一个大的锁更快吗？

根据要求：

#include <iostream>
#include <thread>
#include <string>
#include <chrono>
#include <mutex>
#include <vector>
#include <algorithm>

// Guard ties the join of a std::thread to scope exit: when the Guard is
// destroyed it joins the referenced thread if that thread is still joinable,
// so callers do not have to remember to join on every exit path.
class Guard {
public:
    explicit Guard(std::thread& t_) : worker_(t_) {}

    // A Guard can be neither copy-constructed nor copy-assigned.
    Guard(const Guard&) = delete;
    Guard& operator=(const Guard&) = delete;

    ~Guard() {
        if (!worker_.joinable()) {
            return;
        }
        worker_.join();  // blocks until the thread has finished
    }

private:
    std::thread& worker_;  // referenced, not owned; must outlive this Guard
};

// Writes `input`, then `x`, then the calling thread's id to std::cout and
// finishes the line with std::endl. std::hex is inserted after `x` and stays
// set on std::cout once the function returns.
void idfunc(int x, std::string input) {
    std::ostream& out = std::cout;
    out << input;
    out << x;
    out << std::hex;
    out << std::this_thread::get_id();
    out << std::endl;
}

// Number of loop iterations (one increment or decrement each) per worker.
#define CONSTOP 1000000
// Number of int slots in the shared buffer, and of per-slot mutexes in `mm`.
#define SIZEBUFFER 5

// Completion flags: each worker sets its flag to true once its loop is done.
bool s1 = false;
bool s2 = false;
bool s3 = false;
bool s4 = false;

// Elapsed time, in seconds, most recently recorded by each worker.
float d1 = 0.0f;
float d2 = 0.0f;
float d3 = 0.0f;
float d4 = 0.0f;

// Fine-grained variant: one mutex per buffer slot.
std::vector<std::mutex> mm(SIZEBUFFER);
// Coarse-grained variant: a single mutex guarding every slot.
std::mutex singlelock;

// Single-lock worker: performs CONSTOP increments on `input`, visiting slot
// i % SIZEBUFFER for i ascending from 0, and holds the global mutex
// `singlelock` around every update. When the loop is done it sets s1 and
// stores the elapsed time, in seconds, in d1.
void fs_up(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; i++) {
        std::lock_guard<std::mutex> hold(singlelock);  // unlocked at the end of each iteration
        input[i % SIZEBUFFER]++;
    }
    s1 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d1 = static_cast<float>(diff.count());  // d1 is a float: make the narrowing explicit
}


// Single-lock worker: performs CONSTOP decrements on `input`, visiting slot
// i % SIZEBUFFER for i ascending from 0, and holds the global mutex
// `singlelock` around every update. When the loop is done it sets s2 and
// stores the elapsed time, in seconds, in d2.
void fs_down(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; i++) {
        std::lock_guard<std::mutex> hold(singlelock);  // unlocked at the end of each iteration
        input[i % SIZEBUFFER]--;
    }
    s2 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d2 = static_cast<float>(diff.count());  // d2 is a float: make the narrowing explicit
}

// Single-lock worker: performs CONSTOP decrements on `input`, visiting slot
// i % SIZEBUFFER for i descending from CONSTOP - 1 to 0, and holds the global
// mutex `singlelock` around every update. When the loop is done it sets s3
// and stores the elapsed time, in seconds, in d3.
void fs_downright(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; i--) {
        std::lock_guard<std::mutex> hold(singlelock);  // unlocked at the end of each iteration
        input[i % SIZEBUFFER]--;
    }
    s3 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d3 = static_cast<float>(diff.count());  // d3 is a float: make the narrowing explicit
}

// Single-lock worker: performs CONSTOP increments on `input`, visiting slot
// i % SIZEBUFFER for i descending from CONSTOP - 1 to 0, and holds the global
// mutex `singlelock` around every update. When the loop is done it sets s4
// and stores the elapsed time, in seconds, in d4.
void fs_upright(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; i--) {
        std::lock_guard<std::mutex> hold(singlelock);  // unlocked at the end of each iteration
        input[i % SIZEBUFFER]++;
    }
    s4 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d4 = static_cast<float>(diff.count());  // d4 is a float: make the narrowing explicit
}

// Unsynchronised worker: performs CONSTOP increments on `input`, visiting slot
// i % SIZEBUFFER for i ascending from 0, without taking any lock. This is the
// deliberate "no sync" case of the benchmark: when several such workers run
// concurrently on the same buffer the updates race and the final values are
// not reliable. When the loop is done it sets s1 and stores the elapsed time,
// in seconds, in d1.
void f_upno(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; i++) {
        input[i % SIZEBUFFER]++;
    }
    s1 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d1 = static_cast<float>(diff.count());  // d1 is a float: make the narrowing explicit
}


// Unsynchronised worker: performs CONSTOP decrements on `input`, visiting slot
// i % SIZEBUFFER for i ascending from 0, without taking any lock. This is the
// deliberate "no sync" case of the benchmark: when several such workers run
// concurrently on the same buffer the updates race and the final values are
// not reliable. When the loop is done it sets s2 and stores the elapsed time,
// in seconds, in d2.
void f_downno(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; i++) {
        input[i % SIZEBUFFER]--;
    }
    s2 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d2 = static_cast<float>(diff.count());  // d2 is a float: make the narrowing explicit
}

// Unsynchronised worker: performs CONSTOP decrements on `input`, visiting slot
// i % SIZEBUFFER for i descending from CONSTOP - 1 to 0, without taking any
// lock. This is the deliberate "no sync" case of the benchmark: when several
// such workers run concurrently on the same buffer the updates race and the
// final values are not reliable. When the loop is done it sets s3 and stores
// the elapsed time, in seconds, in d3.
void f_downrightno(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; i--) {
        input[i % SIZEBUFFER]--;
    }
    s3 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d3 = static_cast<float>(diff.count());  // d3 is a float: make the narrowing explicit
}

// Unsynchronised worker: performs CONSTOP increments on `input`, visiting slot
// i % SIZEBUFFER for i descending from CONSTOP - 1 to 0, without taking any
// lock. This is the deliberate "no sync" case of the benchmark: when several
// such workers run concurrently on the same buffer the updates race and the
// final values are not reliable. When the loop is done it sets s4 and stores
// the elapsed time, in seconds, in d4.
void f_uprightno(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; i--) {
        input[i % SIZEBUFFER]++;
    }
    s4 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d4 = static_cast<float>(diff.count());  // d4 is a float: make the narrowing explicit
}



// Per-slot-lock worker: performs CONSTOP increments on `input`, visiting slot
// i % SIZEBUFFER for i ascending from 0, and holds only that slot's mutex
// (mm[slot]) around each update. When the loop is done it sets s1 and stores
// the elapsed time, in seconds, in d1.
void f_up(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; i++) {
        const int slot = i % SIZEBUFFER;
        std::lock_guard<std::mutex> hold(mm[slot]);  // unlocked at the end of each iteration
        input[slot]++;
    }
    s1 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d1 = static_cast<float>(diff.count());  // d1 is a float: make the narrowing explicit
}


// Per-slot-lock worker: performs CONSTOP decrements on `input`, visiting slot
// i % SIZEBUFFER for i ascending from 0, and holds only that slot's mutex
// (mm[slot]) around each update. When the loop is done it sets s2 and stores
// the elapsed time, in seconds, in d2.
void f_down(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; i++) {
        const int slot = i % SIZEBUFFER;
        std::lock_guard<std::mutex> hold(mm[slot]);  // unlocked at the end of each iteration
        input[slot]--;
    }
    s2 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d2 = static_cast<float>(diff.count());  // d2 is a float: make the narrowing explicit
}

// Per-slot-lock worker: performs CONSTOP decrements on `input`, visiting slot
// i % SIZEBUFFER for i descending from CONSTOP - 1 to 0, and holds only that
// slot's mutex (mm[slot]) around each update. When the loop is done it sets
// s3 and stores the elapsed time, in seconds, in d3.
void f_downright(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; i--) {
        const int slot = i % SIZEBUFFER;
        std::lock_guard<std::mutex> hold(mm[slot]);  // unlocked at the end of each iteration
        input[slot]--;
    }
    s3 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d3 = static_cast<float>(diff.count());  // d3 is a float: make the narrowing explicit
}

// Per-slot-lock worker: performs CONSTOP increments on `input`, visiting slot
// i % SIZEBUFFER for i descending from CONSTOP - 1 to 0, and holds only that
// slot's mutex (mm[slot]) around each update. When the loop is done it sets
// s4 and stores the elapsed time, in seconds, in d4.
void f_upright(std::vector<int>& input) {

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; i--) {
        const int slot = i % SIZEBUFFER;
        std::lock_guard<std::mutex> hold(mm[slot]);  // unlocked at the end of each iteration
        input[slot]++;
    }
    s4 = true;
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> diff = end - start;
    d4 = static_cast<float>(diff.count());  // d4 is a float: make the narrowing explicit
}

int main()
{
    std::vector<int> buffer(SIZEBUFFER, 0);
    auto start = std::chrono::system_clock::now();

    f_up(buffer);
    f_down(buffer);
    f_downright(buffer);
    f_upright(buffer);



    auto end = std::chrono::system_clock::now();
    std::chrono::duration<double> diff = end - start;
    std::cout << "Benchmark is: " << diff.count() << std::endl;

    int num = std::thread::hardware_concurrency();

    /*for (int i = 0; i < num; i++) {
        std::thread t(idfunc, 0, "ThreadID is: ");
        Guard g(t);
        //code safe from here on out
    }*/
    std::thread t1(f_up, std::ref(buffer));
    Guard* g1  = new Guard(t1);
    std::thread t2(f_down, std::ref(buffer));
    Guard* g2 = new Guard(t2);
    std::thread t3(f_downright, std::ref(buffer));
    Guard* g3 = new Guard(t3);
    std::thread t4(f_upright, std::ref(buffer));
    Guard* g4 = new Guard(t4);


    while (true) {  
        break;          // to reuse if main thread is supposed to do something besides waiting
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        if (s1 && s2 && s3 && s4) {  
            break;
        }
    }


    delete g1;
    delete g2;
    delete g3;
    delete g4;

    std::cout << "Individual Locks Execution lasted: " << std::max({ d1,d2,d3,d4 }) << "(" << d1 << " " << d2 << " " << d3 << " " << d4 << ")" << std::endl;
    std::this_thread::sleep_for(std::chrono::seconds(1));
    for (auto cell : buffer) {
        std::cout << std::dec << cell << std::endl;
    }



    std::thread t11(fs_up, std::ref(buffer));
    Guard* g11 = new Guard(t11);
    std::thread t12(fs_down, std::ref(buffer));
    Guard* g12 = new Guard(t12);
    std::thread t13(fs_downright, std::ref(buffer));
    Guard* g13 = new Guard(t13);
    std::thread t14(fs_upright, std::ref(buffer));
    Guard* g14 = new Guard(t14);



    while (true) {
        break;          // to reuse if main thread is supposed to do something besides waiting
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        if (s1 && s2 && s3 && s4) {
            break;
        }
    }


    delete g11;
    delete g12;
    delete g13;
    delete g14;

    std::cout << "One Lock Execution lasted: " << std::max({ d1,d2,d3,d4 }) << "(" << d1 << " " << d2 << " " << d3 << " " << d4 << ")" << std::endl;

    std::this_thread::sleep_for(std::chrono::seconds(1));

    for (auto cell : buffer) {
        std::cout << std::dec << cell << std::endl;
    }


    std::thread tn1(f_upno, std::ref(buffer));
    Guard* gn1 = new Guard(tn1);
    std::thread tn2(f_downno, std::ref(buffer));
    Guard* gn2 = new Guard(tn2);
    std::thread tn3(f_downrightno, std::ref(buffer));
    Guard* gn3 = new Guard(tn3);
    std::thread tn4(f_uprightno, std::ref(buffer));
    Guard* gn4 = new Guard(tn4);



    while (true) {
        break;          // to reuse if main thread is supposed to do something besides waiting
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        if (s1 && s2 && s3 && s4) {
            break;
        }
    }


    delete gn1;
    delete gn2;
    delete gn3;
    delete gn4;

    std::cout << "No Sync Execution lasted: " << std::max({ d1,d2,d3,d4 }) << "(" << d1 << " " << d2 << " " << d3 << " " << d4 << ")" << std::endl;
    std::this_thread::sleep_for(std::chrono::seconds(1));
    for (auto cell : buffer) {
        std::cout << std::dec << cell << std::endl;
    }


    std::this_thread::sleep_for(std::chrono::seconds(5));

}

基本上，我为每个测试都编写了函数，这些函数按顺序依次运行。每个阶段的所有线程都会被销毁并重新创建。我使用的是 VS2017 自带的 Microsoft C/C++ 编译器，所以至少支持 C++11。我没有更改任何默认编译选项，但我确实注意到优化是关闭的。无论如何，以下是全部编译器选项：

/permissive- /GS /analyze- /W3 /Zc:wchar_t /ZI /Gm /Od /sdl /Fd"Debug\vc141.pdb" /Zc:inline /fp:precise /D "_MBCS " /errorReport:prompt /WX- /Zc:forScope /RTC1 /Gd /Oy- /MDd /Fa"Debug\" /EHsc /nologo /Fo"Debug\" /Fp"Debug\ThreadingTester.pch" /diagnostics:classic

Answer 1

锁定和解锁互斥量的开销可能比对 int 执行简单算术运算的成本大得多，这使得单个锁的情况几乎可以衡量互斥量的开销。从另一个角度来看，使用这种方法的同步成本大于并行化节省的时间。

由于在单锁情况下几乎没有时间花在锁之外，因此除了一个工作线程外，其余工作线程都在等待锁，这意味着结果大约是单线程方法的耗时加上互斥量的开销。

在衡量性能时，很难设计出一个有代表性的模型。所谓“最佳方案”（best solution）取决于许多因素：在一种情况下最好的方法，在另一种看似相似的情况下可能并不理想。最好在实际应用中进行测量。

Answer 2

除了已接受的答案之外，再看看您在评论中给出的数据

no thread 0.16 / 5 locks 0.18 / 1 lock 0.14 / no locks 0.04

以及您的代码：“no thread”和“1 lock”预计会产生相同的结果。这个 0.2 的差异与使用 5 把锁时的差异大小相同。

鉴于这些值的单位是毫秒，与您机器上同时发生的其他事情相比，这点差异可能微不足道；毕竟还有很多其他线程在向操作系统请求 CPU 时间和内存带宽。

0.2 毫秒的差异也可能与 CPU 缓存有关，还可能与一系列我一时想不到的其他因素有关。但我想强调的是：测试的误差是未知的；与所有统计数据一样，误差本身也是一项需要衡量的重要内容。

选择锁粒度

Choosing lock granularity

c++

performance

multithreading

mutex

locking