使用 softmax 进行动作选择?
Action selection with softmax?
我知道这可能是一个非常愚蠢的问题,但到底是什么..
我目前正在尝试实施软最大动作 select或者,它使用玻尔兹曼分布。
我有点不确定的是,如果您想使用特定操作,如何知道?
我的意思是该函数为我提供了一个概率?但我如何使用它来 select 我想执行哪个动作?
对于某些机器学习应用程序,有时需要将一组原始输出(例如来自神经网络的输出)映射到一组概率,归一化为总和为 1。
在强化学习中,可能需要将一组可用动作的权重映射到一组相关联的概率,然后随机使用这些概率select采取下一个动作。
Softmax 函数通常用于将输出权重映射到一组相应的概率。 "temperature" 参数允许调整 selection 策略,在纯开发("greedy" 策略,其中始终选择最高权重的动作)和纯探索(其中每个动作被选中的概率相同)。
这是一个使用Softmax函数的简单例子。每个 "action" 对应于此代码中传递的 vector<double>
个对象中的一个索引条目。
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <cmath>
using std::vector;
// The temperature parameter here might be 1/temperature seen elsewhere.
// Here, lower temperatures move the highest-weighted output
// toward a probability of 1.0.
// And higer temperatures tend to even out all the probabilities,
// toward 1/<entry count>.
// temperature's range is between 0 and +Infinity (excluding these
// two extremes).
vector<double> Softmax(const vector<double>& weights, double temperature) {
vector<double> probs;
double sum = 0;
for(auto weight : weights) {
double pr = std::exp(weight/temperature);
sum += pr;
probs.push_back(pr);
}
for(auto& pr : probs) {
pr /= sum;
}
return probs;
}
// Rng class encapsulates random number generation
// of double values uniformly distributed between 0 and 1,
// in case you need to replace std's <random> with something else.
struct Rng {
std::mt19937 engine;
std::uniform_real_distribution<double> distribution;
Rng() : distribution(0,1) {
std::random_device rd;
engine.seed(rd());
}
double operator ()() {
return distribution(engine);
}
};
// Selects one index out of a vector of probabilities, "probs"
// The sum of all elements in "probs" must be 1.
vector<double>::size_type StochasticSelection(const vector<double>& probs) {
// The unit interval is divided into sub-intervals, one for each
// entry in "probs". Each sub-interval's size is proportional
// to its corresponding probability.
// You can imagine a roulette wheel divided into differently-sized
// slots for each entry. An entry's slot size is proportional to
// its probability and all the entries' slots combine to fill
// the entire roulette wheel.
// The roulette "ball"'s final location on the wheel is determined
// by generating a (pseudo)random value between 0 and 1.
// Then a linear search finds the entry whose sub-interval contains
// this value. Finally, the selected entry's index is returned.
static Rng rng;
const double point = rng();
double cur_cutoff = 0;
for(vector<double>::size_type i=0; i<probs.size()-1; ++i) {
cur_cutoff += probs[i];
if(point < cur_cutoff) return i;
}
return probs.size()-1;
}
void DumpSelections(const vector<double>& probs, int sample_count) {
for(int i=0; i<sample_count; ++i) {
auto selection = StochasticSelection(probs);
std::cout << " " << selection;
}
std::cout << '\n';
}
void DumpDist(const vector<double>& probs) {
auto flags = std::cout.flags();
std::cout.precision(2);
for(vector<double>::size_type i=0; i<probs.size(); ++i) {
if(i) std::cout << " ";
std::cout << std::setw(2) << i << ':' << std::setw(8) << probs[i];
}
std::cout.flags(flags);
std::cout << '\n';
}
int main() {
vector<double> weights = {1.0, 2, 6, -2.5, 0};
std::cout << "Original weights:\n";
for(vector<double>::size_type i=0; i<weights.size(); ++i) {
std::cout << " " << i << ':' << weights[i];
}
std::cout << "\n\nSoftmax mappings for different temperatures:\n";
auto softmax_thalf = Softmax(weights, 0.5);
auto softmax_t1 = Softmax(weights, 1);
auto softmax_t2 = Softmax(weights, 2);
auto softmax_t10 = Softmax(weights, 10);
std::cout << "[Temp 1/2] ";
DumpDist(softmax_thalf);
std::cout << "[Temp 1] ";
DumpDist(softmax_t1);
std::cout << "[Temp 2] ";
DumpDist(softmax_t2);
std::cout << "[Temp 10] ";
DumpDist(softmax_t10);
std::cout << "\nSelections from softmax_t1:\n";
DumpSelections(softmax_t1, 20);
std::cout << "\nSelections from softmax_t2:\n";
DumpSelections(softmax_t2, 20);
std::cout << "\nSelections from softmax_t10:\n";
DumpSelections(softmax_t10, 20);
}
这是一个输出示例:
Original weights:
0:1 1:2 2:6 3:-2.5 4:0
Softmax mappings for different temperatures:
[Temp 1/2] 0: 4.5e-05 1: 0.00034 2: 1 3: 4.1e-08 4: 6.1e-06
[Temp 1] 0: 0.0066 1: 0.018 2: 0.97 3: 0.0002 4: 0.0024
[Temp 2] 0: 0.064 1: 0.11 2: 0.78 3: 0.011 4: 0.039
[Temp 10] 0: 0.19 1: 0.21 2: 0.31 3: 0.13 4: 0.17
Selections from softmax_t1:
2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1
Selections from softmax_t2:
2 2 2 2 2 2 1 2 2 1 2 2 2 1 2 2 2 2 2 1
Selections from softmax_t10:
0 0 4 1 2 2 2 0 0 1 3 4 2 2 4 3 2 1 0 1
我知道这可能是一个非常愚蠢的问题,但到底是什么..
我目前正在尝试实施软最大动作 select或者,它使用玻尔兹曼分布。
我有点不确定的是,如果您想使用特定操作,如何知道? 我的意思是该函数为我提供了一个概率?但我如何使用它来 select 我想执行哪个动作?
对于某些机器学习应用程序,有时需要将一组原始输出(例如来自神经网络的输出)映射到一组概率,归一化为总和为 1。
在强化学习中,可能需要将一组可用动作的权重映射到一组相关联的概率,然后随机使用这些概率select采取下一个动作。
Softmax 函数通常用于将输出权重映射到一组相应的概率。 "temperature" 参数允许调整 selection 策略,在纯开发("greedy" 策略,其中始终选择最高权重的动作)和纯探索(其中每个动作被选中的概率相同)。
这是一个使用Softmax函数的简单例子。每个 "action" 对应于此代码中传递的 vector<double>
个对象中的一个索引条目。
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <cmath>
using std::vector;
// The temperature parameter here might be 1/temperature seen elsewhere.
// Here, lower temperatures move the highest-weighted output
// toward a probability of 1.0.
// And higer temperatures tend to even out all the probabilities,
// toward 1/<entry count>.
// temperature's range is between 0 and +Infinity (excluding these
// two extremes).
vector<double> Softmax(const vector<double>& weights, double temperature) {
vector<double> probs;
double sum = 0;
for(auto weight : weights) {
double pr = std::exp(weight/temperature);
sum += pr;
probs.push_back(pr);
}
for(auto& pr : probs) {
pr /= sum;
}
return probs;
}
// Rng class encapsulates random number generation
// of double values uniformly distributed between 0 and 1,
// in case you need to replace std's <random> with something else.
struct Rng {
std::mt19937 engine;
std::uniform_real_distribution<double> distribution;
Rng() : distribution(0,1) {
std::random_device rd;
engine.seed(rd());
}
double operator ()() {
return distribution(engine);
}
};
// Selects one index out of a vector of probabilities, "probs"
// The sum of all elements in "probs" must be 1.
vector<double>::size_type StochasticSelection(const vector<double>& probs) {
// The unit interval is divided into sub-intervals, one for each
// entry in "probs". Each sub-interval's size is proportional
// to its corresponding probability.
// You can imagine a roulette wheel divided into differently-sized
// slots for each entry. An entry's slot size is proportional to
// its probability and all the entries' slots combine to fill
// the entire roulette wheel.
// The roulette "ball"'s final location on the wheel is determined
// by generating a (pseudo)random value between 0 and 1.
// Then a linear search finds the entry whose sub-interval contains
// this value. Finally, the selected entry's index is returned.
static Rng rng;
const double point = rng();
double cur_cutoff = 0;
for(vector<double>::size_type i=0; i<probs.size()-1; ++i) {
cur_cutoff += probs[i];
if(point < cur_cutoff) return i;
}
return probs.size()-1;
}
void DumpSelections(const vector<double>& probs, int sample_count) {
for(int i=0; i<sample_count; ++i) {
auto selection = StochasticSelection(probs);
std::cout << " " << selection;
}
std::cout << '\n';
}
void DumpDist(const vector<double>& probs) {
auto flags = std::cout.flags();
std::cout.precision(2);
for(vector<double>::size_type i=0; i<probs.size(); ++i) {
if(i) std::cout << " ";
std::cout << std::setw(2) << i << ':' << std::setw(8) << probs[i];
}
std::cout.flags(flags);
std::cout << '\n';
}
int main() {
vector<double> weights = {1.0, 2, 6, -2.5, 0};
std::cout << "Original weights:\n";
for(vector<double>::size_type i=0; i<weights.size(); ++i) {
std::cout << " " << i << ':' << weights[i];
}
std::cout << "\n\nSoftmax mappings for different temperatures:\n";
auto softmax_thalf = Softmax(weights, 0.5);
auto softmax_t1 = Softmax(weights, 1);
auto softmax_t2 = Softmax(weights, 2);
auto softmax_t10 = Softmax(weights, 10);
std::cout << "[Temp 1/2] ";
DumpDist(softmax_thalf);
std::cout << "[Temp 1] ";
DumpDist(softmax_t1);
std::cout << "[Temp 2] ";
DumpDist(softmax_t2);
std::cout << "[Temp 10] ";
DumpDist(softmax_t10);
std::cout << "\nSelections from softmax_t1:\n";
DumpSelections(softmax_t1, 20);
std::cout << "\nSelections from softmax_t2:\n";
DumpSelections(softmax_t2, 20);
std::cout << "\nSelections from softmax_t10:\n";
DumpSelections(softmax_t10, 20);
}
这是一个输出示例:
Original weights:
0:1 1:2 2:6 3:-2.5 4:0
Softmax mappings for different temperatures:
[Temp 1/2] 0: 4.5e-05 1: 0.00034 2: 1 3: 4.1e-08 4: 6.1e-06
[Temp 1] 0: 0.0066 1: 0.018 2: 0.97 3: 0.0002 4: 0.0024
[Temp 2] 0: 0.064 1: 0.11 2: 0.78 3: 0.011 4: 0.039
[Temp 10] 0: 0.19 1: 0.21 2: 0.31 3: 0.13 4: 0.17
Selections from softmax_t1:
2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1
Selections from softmax_t2:
2 2 2 2 2 2 1 2 2 1 2 2 2 1 2 2 2 2 2 1
Selections from softmax_t10:
0 0 4 1 2 2 2 0 0 1 3 4 2 2 4 3 2 1 0 1