Simple Reinforcement Learning example
I tried to create a simplified rl4j example based on the existing Gym and Malmo examples.
Given a sine wave, the AI should say whether we are at the top of the wave, at the bottom, or somewhere else (noop).
SineRider is the "game"; the state is the value of the sine function (just a double).
The problem is that the step function in SineRider is never called to produce a reward. What am I doing wrong?
Kotlin:
package aiexample
import org.deeplearning4j.gym.StepReply
import org.deeplearning4j.rl4j.learning.sync.qlearning.QLearning
import org.deeplearning4j.rl4j.learning.sync.qlearning.discrete.QLearningDiscreteDense
import org.deeplearning4j.rl4j.mdp.MDP
import org.deeplearning4j.rl4j.network.dqn.DQNFactoryStdDense
import org.deeplearning4j.rl4j.space.DiscreteSpace
import org.deeplearning4j.rl4j.space.Encodable
import org.deeplearning4j.rl4j.space.ObservationSpace
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.factory.Nd4j
import org.nd4j.linalg.learning.config.Adam
import kotlin.math.sin
object Example {
    var ql: QLearning.QLConfiguration = QLearning.QLConfiguration(
        123,    //Random seed
        1000,   //Max step by epoch
        8000,   //Max step
        1000,   //Max size of experience replay
        32,     //Size of batches
        100,    //Target update (hard)
        0,      //Num step noop warmup
        0.05,   //Reward scaling
        0.99,   //Gamma
        10.0,   //TD-error clipping
        0.1f,   //Min epsilon
        2000,   //Num step for eps greedy anneal
        true    //Double DQN
    )

    var net: DQNFactoryStdDense.Configuration = DQNFactoryStdDense.Configuration.builder()
        .l2(0.01).updater(Adam(1e-2)).numLayer(3).numHiddenNodes(16).build()

    @JvmStatic
    fun main(args: Array<String>) {
        simpleSine()
    }

    private fun simpleSine() {
        val mdp = Env.create()
        val dql = QLearningDiscreteDense(mdp, net, ql)
        dql.train()
        mdp.close()
    }
}
class Action(val name: String) {
    companion object {
        val noop = Action("noop")
        val top = Action("top")
        val bottom = Action("bottom")
    }
}

class State(private val inputs: DoubleArray) : Encodable {
    override fun toArray(): DoubleArray {
        return inputs
    }
}
class SineObservationSpace : ObservationSpace<State> {
    override fun getLow(): INDArray {
        return Nd4j.create(doubleArrayOf(-1.0))
    }

    override fun getHigh(): INDArray {
        return Nd4j.create(doubleArrayOf(1.0))
    }

    override fun getName(): String {
        return "Discrete"
    }

    override fun getShape(): IntArray {
        return intArrayOf(1)
    }
}
class SineRider {
    companion object {
        val actions = mapOf(
            0 to Action.noop,
            1 to Action.top,
            2 to Action.bottom)
    }

    var i = 0.0

    fun step(action: Int): Double {
        val act = actions[action]
        if (act == Action.top) {
            return if (i > 0.9) 1.0 else -1.0
        }
        if (act == Action.bottom) {
            return if (i < -0.9) 1.0 else -1.0
        }
        if (act == Action.noop) {
            return if (i < 0.9 && i > -0.9) 1.0 else -1.0
        }
        return 0.0
    }

    fun reset() {
    }

    fun next() {
        i += 0.1
    }

    fun state(): State {
        val sine = sin(i)
        next()
        return State(arrayOf(sine).toDoubleArray())
    }
}
class Env(private val sineRider: SineRider) : MDP<State, Int, DiscreteSpace> {
    private val actionSpace = DiscreteSpace(3)
    private var done = false

    override fun getObservationSpace(): ObservationSpace<State> {
        return SineObservationSpace()
    }

    override fun getActionSpace(): DiscreteSpace {
        return actionSpace
    }

    override fun step(action: Int): StepReply<State> {
        val reward = sineRider.step(action)
        val state = sineRider.state()
        return StepReply(state, reward, true, null)
    }

    override fun isDone(): Boolean {
        return true
    }

    override fun reset(): State? {
        done = false
        sineRider.reset()
        return sineRider.state()
    }

    override fun close() {
    }

    override fun newInstance(): Env {
        return create()
    }

    companion object {
        fun create(): Env {
            val sinRider = SineRider()
            return Env(sinRider)
        }
    }
}
The problem is in the isDone() function. It always says the game is over.
Code change:
class Env...
    var stepCount = 0

    override fun isDone(): Boolean {
        return stepCount > 1000
    }

    override fun reset(): State? {
        stepCount = 0
        ...
    }
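Putting the fix together, here is a minimal sketch of what the corrected Env could look like. It assumes stepCount is incremented inside step(), and it also passes isDone() to StepReply instead of the hard-coded true so the per-step done flag matches the episode state; that last tweak is my own assumption and is not part of the quoted fix.

class Env(private val sineRider: SineRider) : MDP<State, Int, DiscreteSpace> {
    private val actionSpace = DiscreteSpace(3)
    private var stepCount = 0

    override fun getObservationSpace(): ObservationSpace<State> = SineObservationSpace()

    override fun getActionSpace(): DiscreteSpace = actionSpace

    override fun step(action: Int): StepReply<State> {
        stepCount++                              // count steps so the episode can actually end
        val reward = sineRider.step(action)
        val state = sineRider.state()
        // assumption: report the real done flag instead of a hard-coded true
        return StepReply(state, reward, isDone(), null)
    }

    override fun isDone(): Boolean = stepCount > 1000   // episode ends after ~1000 steps

    override fun reset(): State? {
        stepCount = 0
        sineRider.reset()
        return sineRider.state()
    }

    override fun close() {
    }

    override fun newInstance(): Env = Env(SineRider())

    companion object {
        fun create(): Env = Env(SineRider())
    }
}

With isDone() no longer returning true unconditionally, QLearningDiscreteDense keeps stepping the environment, so SineRider.step() is actually called and rewards are collected as expected.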
Thanks to Paul Dubs -> https://community.konduit.ai/t/simplified-example/621