MCTS 代理在 Tic-Tac-Toe 上做出了错误的决定
MCTS Agent making bad decisions on Tic-Tac-Toe
我已经在 MCTS AI 上工作了几天。我尝试在 Tic-Tac-Toe 上实现它,这是我能想到的最不复杂的游戏,但出于某种原因,我的 AI 总是做出错误的决定。我已经尝试更改 UCB1 的探索常数的值、每次搜索的迭代次数,甚至是获胜、失败和平局的分数(试图让平局更有价值,因为这个 AI 只玩第二个,并尝试获得平局,否则获胜)。截至目前,代码如下所示:
import random
import math
import copy
class tree:
def __init__(self, board):
self.board = board
self.visits = 0
self.score = 0
self.children = []
class mcts:
def search(self, mx, player,):
root = tree(mx)
for i in range(1200):
leaf = mcts.expand(self, root.board, player, root)
result = mcts.rollout(self, leaf)
mcts.backpropagate(self, leaf, root, result)
return mcts.best_child(self, root).board
def expand(self, mx, player, root):
plays = mcts.generate_states(self, mx, player) #all possible plays
if root.visits == 0:
for j in plays:
root.children.append(j) #create child_nodes in case they havent been created yet
for j in root.children:
if j.visits == 0:
return j #first iterations of the loop
for j in plays:
if mcts.final(self, j.board, player):
return j
return mcts.best_child(self, root) #choose the one with most potential
def rollout(self, leaf):
mx = leaf.board
aux = 1
while mcts.final(self, mx, "O") != True:
if aux == 1: # "X" playing
possible_states = []
possible_nodes = mcts.generate_states(self, mx, "X")
for i in possible_nodes:
possible_states.append(i.board)
if len(possible_states) == 1: mx = possible_states[0]
else:
choice = random.randrange(0, len(possible_states) - 1)
mx = possible_states[choice]
if mcts.final(self, mx, "X"): #The play by "X" finished the game
break
elif aux == 0: # "O" playing
possible_states = []
possible_nodes = mcts.generate_states(self, mx, "O")
for i in possible_nodes:
possible_states.append(i.board)
if len(possible_states) == 1: mx = possible_states[0]
else:
choice = random.randrange(0, len(possible_states) - 1)
mx = possible_states[choice]
aux += 1
aux = aux%2
if mcts.final(self, mx, "X"):
for i in range(len(mx)):
for k in range(len(mx[i])):
if mx[i][k] == "-":
return -1 #loss
return 0 #tie
elif mcts.final(self, mx, "O"):
for i in range(len(mx)):
for k in range(len(mx[i])):
if mx[i][k] == "-":
return 1 #win
def backpropagate(self, leaf, root, result): # updating our prospects stats
leaf.score += result
leaf.visits += 1
root.visits += 1
def generate_states(self, mx, player):
possible_states = [] #generate child_nodes
for i in range(len(mx)):
for k in range(len(mx[i])):
if mx[i][k] == "-":
option = copy.deepcopy(mx)
option[i][k] = player
child_node = tree(option)
possible_states.append(child_node)
return possible_states
def final(self,mx, player): #check if game is won
possible_draw = True
win = False
for i in mx: #lines
if i == [player, player, player]:
win = True
possible_draw = False
if mx[0][0] == player: #diagonals
if mx[1][1] == player:
if mx[2][2] == player:
win = True
possible_draw = False
if mx[0][2] == player:
if mx[1][1] == player:
if mx[2][0] == player:
win = True
possible_draw = False
for i in range(3): #columns
if mx[0][i] == player and mx[1][i] == player and mx[2][i] == player:
win = True
possible_draw = False
for i in range(3):
for k in range(3):
if mx[i][k] == "-":
possible_draw = False
if possible_draw:
return possible_draw
return win
def calculate_score(self, score, child_visits, parent_visits, c): #UCB1
return score / child_visits + c * math.sqrt(math.log(parent_visits) / child_visits)
def best_child(self, root): #returns most promising node
treshold = -1*10**6
for j in root.children:
potential = mcts.calculate_score(self, j.score, j.visits, root.visits, 2)
if potential > treshold:
win_choice = j
treshold = potential
return win_choice
#todo the AI takes too long for each play, optimize that by finding the optimal approach in the rollout phase
首先,这个 AI 的目的是 return 一个改变的矩阵,在那种情况下他可以做出最好的表现。我发现自己质疑 MCTS 算法是否是所有这些失败游戏背后的原因,因为它在实施中可能存在一些错误。话虽如此,在我看来,代码执行以下操作:
- 检查根是否已经有它的children,如果有,选择最有希望的。
- 展开随机模拟并保存结果。
- 更新叶子的分数、访问次数和根的访问次数。
- 重复 1200 次迭代,在我的示例中
- Return 可能的最佳着法(矩阵,child_node)。
为什么不起作用?为什么它选择糟糕的剧本而不是最佳的剧本?算法实现有误吗?
我的错误是在扩展阶段选择了访问次数最多的节点,而根据 UCB1 公式,它应该是最有潜力的节点。我在执行一些 if 子句时也有一些错误,因为所有的损失都没有计算在内。
我已经在 MCTS AI 上工作了几天。我尝试在 Tic-Tac-Toe 上实现它,这是我能想到的最不复杂的游戏,但出于某种原因,我的 AI 总是做出错误的决定。我已经尝试更改 UCB1 的探索常数的值、每次搜索的迭代次数,甚至是获胜、失败和平局的分数(试图让平局更有价值,因为这个 AI 只玩第二个,并尝试获得平局,否则获胜)。截至目前,代码如下所示:
import random
import math
import copy
class tree:
def __init__(self, board):
self.board = board
self.visits = 0
self.score = 0
self.children = []
class mcts:
def search(self, mx, player,):
root = tree(mx)
for i in range(1200):
leaf = mcts.expand(self, root.board, player, root)
result = mcts.rollout(self, leaf)
mcts.backpropagate(self, leaf, root, result)
return mcts.best_child(self, root).board
def expand(self, mx, player, root):
plays = mcts.generate_states(self, mx, player) #all possible plays
if root.visits == 0:
for j in plays:
root.children.append(j) #create child_nodes in case they havent been created yet
for j in root.children:
if j.visits == 0:
return j #first iterations of the loop
for j in plays:
if mcts.final(self, j.board, player):
return j
return mcts.best_child(self, root) #choose the one with most potential
def rollout(self, leaf):
mx = leaf.board
aux = 1
while mcts.final(self, mx, "O") != True:
if aux == 1: # "X" playing
possible_states = []
possible_nodes = mcts.generate_states(self, mx, "X")
for i in possible_nodes:
possible_states.append(i.board)
if len(possible_states) == 1: mx = possible_states[0]
else:
choice = random.randrange(0, len(possible_states) - 1)
mx = possible_states[choice]
if mcts.final(self, mx, "X"): #The play by "X" finished the game
break
elif aux == 0: # "O" playing
possible_states = []
possible_nodes = mcts.generate_states(self, mx, "O")
for i in possible_nodes:
possible_states.append(i.board)
if len(possible_states) == 1: mx = possible_states[0]
else:
choice = random.randrange(0, len(possible_states) - 1)
mx = possible_states[choice]
aux += 1
aux = aux%2
if mcts.final(self, mx, "X"):
for i in range(len(mx)):
for k in range(len(mx[i])):
if mx[i][k] == "-":
return -1 #loss
return 0 #tie
elif mcts.final(self, mx, "O"):
for i in range(len(mx)):
for k in range(len(mx[i])):
if mx[i][k] == "-":
return 1 #win
def backpropagate(self, leaf, root, result): # updating our prospects stats
leaf.score += result
leaf.visits += 1
root.visits += 1
def generate_states(self, mx, player):
possible_states = [] #generate child_nodes
for i in range(len(mx)):
for k in range(len(mx[i])):
if mx[i][k] == "-":
option = copy.deepcopy(mx)
option[i][k] = player
child_node = tree(option)
possible_states.append(child_node)
return possible_states
def final(self,mx, player): #check if game is won
possible_draw = True
win = False
for i in mx: #lines
if i == [player, player, player]:
win = True
possible_draw = False
if mx[0][0] == player: #diagonals
if mx[1][1] == player:
if mx[2][2] == player:
win = True
possible_draw = False
if mx[0][2] == player:
if mx[1][1] == player:
if mx[2][0] == player:
win = True
possible_draw = False
for i in range(3): #columns
if mx[0][i] == player and mx[1][i] == player and mx[2][i] == player:
win = True
possible_draw = False
for i in range(3):
for k in range(3):
if mx[i][k] == "-":
possible_draw = False
if possible_draw:
return possible_draw
return win
def calculate_score(self, score, child_visits, parent_visits, c): #UCB1
return score / child_visits + c * math.sqrt(math.log(parent_visits) / child_visits)
def best_child(self, root): #returns most promising node
treshold = -1*10**6
for j in root.children:
potential = mcts.calculate_score(self, j.score, j.visits, root.visits, 2)
if potential > treshold:
win_choice = j
treshold = potential
return win_choice
#todo the AI takes too long for each play, optimize that by finding the optimal approach in the rollout phase
首先,这个 AI 的目的是 return 一个改变的矩阵,在那种情况下他可以做出最好的表现。我发现自己质疑 MCTS 算法是否是所有这些失败游戏背后的原因,因为它在实施中可能存在一些错误。话虽如此,在我看来,代码执行以下操作:
- 检查根是否已经有它的children,如果有,选择最有希望的。
- 展开随机模拟并保存结果。
- 更新叶子的分数、访问次数和根的访问次数。
- 重复 1200 次迭代,在我的示例中
- Return 可能的最佳着法(矩阵,child_node)。
为什么不起作用?为什么它选择糟糕的剧本而不是最佳的剧本?算法实现有误吗?
我的错误是在扩展阶段选择了访问次数最多的节点,而根据 UCB1 公式,它应该是最有潜力的节点。我在执行一些 if 子句时也有一些错误,因为所有的损失都没有计算在内。