Monte Carlo Tree Search Tic-Tac-Toe -- 可怜的特工
Monte Carlo Tree Search Tic-Tac-Toe -- Poor Agent
我正在尝试实施 Monte Carlo 树搜索以在 Python 中玩井字游戏。我目前的实现如下:
我有一个板子 class 可以处理对井字棋板的更改。棋盘的状态由一个 2x3x3 的 numpy 数组表示,其中 2 个 3x3 矩阵中的每一个都是二进制矩阵,分别表示 X 的存在和 O 的存在。
class Board:
'''
class handling state of the board
'''
def __init__(self):
self.state = np.zeros([2,3,3])
self.player = 0 # current player's turn
def copy(self):
'''
make copy of the board
'''
copy = Board()
copy.player = self.player
copy.state = np.copy(self.state)
return copy
def move(self, move):
'''
take move of form [x,y] and play
the move for the current player
'''
if np.any(self.state[:,move[0],move[1]]): return
self.state[self.player][move[0],move[1]] = 1
self.player ^= 1
def get_moves(self):
'''
return remaining possible board moves
(ie where there are no O's or X's)
'''
return np.argwhere(self.state[0]+self.state[1]==0).tolist()
def result(self):
'''
check rows, columns, and diagonals
for sequence of 3 X's or 3 O's
'''
board = self.state[self.player^1]
col_sum = np.any(np.sum(board,axis=0)==3)
row_sum = np.any(np.sum(board,axis=1)==3)
d1_sum = np.any(np.trace(board)==3)
d2_sum = np.any(np.trace(np.flip(board,1))==3)
return col_sum or row_sum or d1_sum or d2_sum
然后我有一个节点 class,它在构建搜索树时处理节点的属性:
class Node:
'''
maintains state of nodes in
the monte carlo search tree
'''
def __init__(self, parent=None, action=None, board=None):
self.parent = parent
self.board = board
self.children = []
self.wins = 0
self.visits = 0
self.untried_actions = board.get_moves()
self.action = action
def select(self):
'''
select child of node with
highest UCB1 value
'''
s = sorted(self.children, key=lambda c:c.wins/c.visits+0.2*sqrt(2*log(self.visits)/c.visits))
return s[-1]
def expand(self, action, board):
'''
expand parent node (self) by adding child
node with given action and state
'''
child = Node(parent=self, action=action, board=board)
self.untried_actions.remove(action)
self.children.append(child)
return child
def update(self, result):
self.visits += 1
self.wins += result
最后,我有了 UCT 功能,可以将所有内容整合在一起。此函数采用棋盘对象并构建 Monte Carlo 搜索树以确定给定棋盘状态的下一个最佳可能移动:
def UCT(rootstate, maxiters):
root = Node(board=rootstate)
for i in range(maxiters):
node = root
board = rootstate.copy()
# selection - select best child if parent fully expanded and not terminal
while node.untried_actions == [] and node.children != []:
node = node.select()
board.move(node.action)
# expansion - expand parent to a random untried action
if node.untried_actions != []:
a = random.choice(node.untried_actions)
board.move(a)
node = node.expand(a, board.copy())
# simulation - rollout to terminal state from current
# state using random actions
while board.get_moves() != [] and not board.result():
board.move(random.choice(board.get_moves()))
# backpropagation - propagate result of rollout game up the tree
# reverse the result if player at the node lost the rollout game
while node != None:
result = board.result()
if result:
if node.board.player==board.player:
result = 1
else: result = -1
else: result = 0
node.update(result)
node = node.parent
s = sorted(root.children, key=lambda c:c.wins/c.visits)
return s[-1].action
我已经搜索了这段代码几个小时,但在我的实现中根本找不到错误。我已经测试了许多棋盘状态,并让两个代理相互对抗,但即使是最简单的棋盘状态,函数 returns 的动作也很差。我错过了什么 and/or 我的实现有什么问题?
编辑:这里是两个代理如何实现播放的示例:
b = Board() # instantiate board
# while there are moves left to play and neither player has won
while b.get_moves() != [] and not b.result():
a = UCT(b,1000) # get next best move
b.move(a) # make move
print(b.state) # show state
问题如下:
- 您的
get_moves()
函数不会检查游戏是否已经结束。它可以为某人已经获胜的州生成 non-empty 移动列表。
- 创建新的
Node
时,您也不会检查游戏状态是否已经结束,因此会创建 non-empty 个 untried_actions
列表。
- 在算法的选择 和扩展 阶段,您也不检查终端游戏状态。一旦 Expansion 阶段命中了一个包含某人已经获胜的游戏状态的节点,它将愉快地应用一个额外的动作并再次为树创建一个新节点,随后 选择阶段也将愉快地继续进行。
- 对于在某人已经获胜后游戏继续进行的这些节点,
result()
可以 return 一个错误的获胜者。它只是检查最近下棋的玩家是否获胜,如果您在有人获胜后立即停止游戏,这是正确的,但如果您在某人已经获胜后继续游戏,则可能是不正确的。因此,您通过树传播各种不正确的结果。
解决此问题的最简单方法是修改 get_moves()
,使其 return 在游戏结束时成为一个空列表。然后,这些节点将始终无法通过 if node.untried_actions != []
检查,这意味着扩展阶段将被完全跳过,您直接进入 Play-out 阶段正确检查终端游戏状态。这可以按如下方式完成:
def get_moves(self):
"""
return remaining possible board moves
(ie where there are no O's or X's)
"""
if self.result():
return []
return np.argwhere(self.state[0] + self.state[1] == 0).tolist()
我正在尝试实施 Monte Carlo 树搜索以在 Python 中玩井字游戏。我目前的实现如下:
我有一个板子 class 可以处理对井字棋板的更改。棋盘的状态由一个 2x3x3 的 numpy 数组表示,其中 2 个 3x3 矩阵中的每一个都是二进制矩阵,分别表示 X 的存在和 O 的存在。
class Board:
'''
class handling state of the board
'''
def __init__(self):
self.state = np.zeros([2,3,3])
self.player = 0 # current player's turn
def copy(self):
'''
make copy of the board
'''
copy = Board()
copy.player = self.player
copy.state = np.copy(self.state)
return copy
def move(self, move):
'''
take move of form [x,y] and play
the move for the current player
'''
if np.any(self.state[:,move[0],move[1]]): return
self.state[self.player][move[0],move[1]] = 1
self.player ^= 1
def get_moves(self):
'''
return remaining possible board moves
(ie where there are no O's or X's)
'''
return np.argwhere(self.state[0]+self.state[1]==0).tolist()
def result(self):
'''
check rows, columns, and diagonals
for sequence of 3 X's or 3 O's
'''
board = self.state[self.player^1]
col_sum = np.any(np.sum(board,axis=0)==3)
row_sum = np.any(np.sum(board,axis=1)==3)
d1_sum = np.any(np.trace(board)==3)
d2_sum = np.any(np.trace(np.flip(board,1))==3)
return col_sum or row_sum or d1_sum or d2_sum
然后我有一个节点 class,它在构建搜索树时处理节点的属性:
class Node:
'''
maintains state of nodes in
the monte carlo search tree
'''
def __init__(self, parent=None, action=None, board=None):
self.parent = parent
self.board = board
self.children = []
self.wins = 0
self.visits = 0
self.untried_actions = board.get_moves()
self.action = action
def select(self):
'''
select child of node with
highest UCB1 value
'''
s = sorted(self.children, key=lambda c:c.wins/c.visits+0.2*sqrt(2*log(self.visits)/c.visits))
return s[-1]
def expand(self, action, board):
'''
expand parent node (self) by adding child
node with given action and state
'''
child = Node(parent=self, action=action, board=board)
self.untried_actions.remove(action)
self.children.append(child)
return child
def update(self, result):
self.visits += 1
self.wins += result
最后,我有了 UCT 功能,可以将所有内容整合在一起。此函数采用棋盘对象并构建 Monte Carlo 搜索树以确定给定棋盘状态的下一个最佳可能移动:
def UCT(rootstate, maxiters):
root = Node(board=rootstate)
for i in range(maxiters):
node = root
board = rootstate.copy()
# selection - select best child if parent fully expanded and not terminal
while node.untried_actions == [] and node.children != []:
node = node.select()
board.move(node.action)
# expansion - expand parent to a random untried action
if node.untried_actions != []:
a = random.choice(node.untried_actions)
board.move(a)
node = node.expand(a, board.copy())
# simulation - rollout to terminal state from current
# state using random actions
while board.get_moves() != [] and not board.result():
board.move(random.choice(board.get_moves()))
# backpropagation - propagate result of rollout game up the tree
# reverse the result if player at the node lost the rollout game
while node != None:
result = board.result()
if result:
if node.board.player==board.player:
result = 1
else: result = -1
else: result = 0
node.update(result)
node = node.parent
s = sorted(root.children, key=lambda c:c.wins/c.visits)
return s[-1].action
我已经搜索了这段代码几个小时,但在我的实现中根本找不到错误。我已经测试了许多棋盘状态,并让两个代理相互对抗,但即使是最简单的棋盘状态,函数 returns 的动作也很差。我错过了什么 and/or 我的实现有什么问题?
编辑:这里是两个代理如何实现播放的示例:
b = Board() # instantiate board
# while there are moves left to play and neither player has won
while b.get_moves() != [] and not b.result():
a = UCT(b,1000) # get next best move
b.move(a) # make move
print(b.state) # show state
问题如下:
- 您的
get_moves()
函数不会检查游戏是否已经结束。它可以为某人已经获胜的州生成 non-empty 移动列表。 - 创建新的
Node
时,您也不会检查游戏状态是否已经结束,因此会创建 non-empty 个untried_actions
列表。 - 在算法的选择 和扩展 阶段,您也不检查终端游戏状态。一旦 Expansion 阶段命中了一个包含某人已经获胜的游戏状态的节点,它将愉快地应用一个额外的动作并再次为树创建一个新节点,随后 选择阶段也将愉快地继续进行。
- 对于在某人已经获胜后游戏继续进行的这些节点,
result()
可以 return 一个错误的获胜者。它只是检查最近下棋的玩家是否获胜,如果您在有人获胜后立即停止游戏,这是正确的,但如果您在某人已经获胜后继续游戏,则可能是不正确的。因此,您通过树传播各种不正确的结果。
解决此问题的最简单方法是修改 get_moves()
,使其 return 在游戏结束时成为一个空列表。然后,这些节点将始终无法通过 if node.untried_actions != []
检查,这意味着扩展阶段将被完全跳过,您直接进入 Play-out 阶段正确检查终端游戏状态。这可以按如下方式完成:
def get_moves(self):
"""
return remaining possible board moves
(ie where there are no O's or X's)
"""
if self.result():
return []
return np.argwhere(self.state[0] + self.state[1] == 0).tolist()