# Source code for ast_toolbox.mcts.AdaptiveStressTestingBlindValue

import numpy as np

from ast_toolbox.mcts.AdaptiveStressTesting import AdaptiveStressTest
from ast_toolbox.mcts.AdaptiveStressTesting import ASTAction


class AdaptiveStressTestBV(AdaptiveStressTest):
    """The AST wrapper for MCTS using the Blind Value exploration [1]_.

    Parameters
    ----------
    kwargs :
        Keyword arguments passed to `ast_toolbox.mcts.AdaptiveStressTesting.AdaptiveStressTest`

    References
    ----------
    .. [1] Couetoux, Adrien, Hassen Doghmen, and Olivier Teytaud.
        "Improving the exploration in upper confidence trees."
        International Conference on Learning and Intelligent Optimization.
        Springer, Berlin, Heidelberg, 2012.
    """

    def __init__(self, **kwargs):
        super(AdaptiveStressTestBV, self).__init__(**kwargs)

    def explore_action(self, s, tree):
        """Sample an action for the exploration using Blind Value.

        ``self.params.M`` candidate actions are drawn from
        ``self.env.action_space``; the candidate with the largest blind value
        (see :meth:`getBV`) is returned. If the state has no explored action
        yet, a single sampled action is returned directly.

        Parameters
        ----------
        s : :py:class:`ast_toolbox.mcts.AdaptiveStressTesting.ASTState`
            The current state.
        tree : dict
            The searching tree.

        Returns
        ----------
        action : :py:class:`ast_toolbox.mcts.AdaptiveStressTesting.ASTAction`
            The sampled action.
        """
        node = tree[s]
        explored = list(node.a.keys())
        space = self.env.action_space

        # Nothing explored yet: there is no UCB to compare against.
        if not explored:
            return ASTAction(space.sample())

        UCB = self.getUCB(node)
        sigma_known = np.std([float(UCB[a]) for a in explored])

        # Candidate pool and each candidate's distance to the centre of the
        # action space (midpoint of ``low`` and ``high``).
        center = (space.low + space.high) / 2.0
        A_pool = [space.sample() for _ in range(self.params.M)]
        dist_pool = [self.getDistance(a, center) for a in A_pool]
        sigma_pool = np.std(dist_pool) if dist_pool else 0.0

        # Guard the ratio: a zero/NaN pool spread (e.g. a single candidate) or
        # a non-finite UCB spread would otherwise give an inf/NaN rho, turning
        # every blind value into NaN so that no candidate is ever selected.
        if sigma_pool > 0 and np.isfinite(sigma_known):
            rho = sigma_known / sigma_pool
        else:
            rho = 0.0

        BV_max = -np.inf
        a_best = None
        for y in A_pool:
            BV = self.getBV(y, rho, explored, UCB)
            if BV > BV_max:
                BV_max = BV
                a_best = y

        if a_best is None:
            # No candidate beat -inf (all blind values NaN, or an empty pool).
            # Never wrap ``None`` in an ASTAction; fall back to a real action.
            a_best = A_pool[0] if A_pool else space.sample()
        return ASTAction(a_best)

    def getDistance(self, a, b):
        """Get the (L2) distance between two actions.

        Parameters
        ----------
        a : :py:class:`numpy.ndarray`
            The first action.
        b : :py:class:`numpy.ndarray`
            The second action.

        Returns
        ----------
        distance : float
            The L2 distance between a and b.
        """
        # asarray is a no-op for ndarrays and lets plain sequences work too.
        diff = np.asarray(a) - np.asarray(b)
        return np.sqrt(np.sum(diff**2))

    def getUCB(self, s):
        """Get the upper confidence bound for the expected return for every
        action that has been explored at the state.

        Parameters
        ----------
        s : :py:class:`ast_toolbox.MCTSdpw.StateNode`
            The state node in the searching tree

        Returns
        ----------
        UCB : dict
            The dictionary containing the upper confidence bound for each
            explored action in the state node.
        """
        nS = s.n
        ec = self.params.ec
        # q + ec * sqrt(log(state visits) / action visits) for each action.
        return {
            a: node.q + ec * np.sqrt(np.log(nS) / float(node.n))
            for a, node in s.a.items()
        }

    def getBV(self, y, rho, A, UCB):
        """Calculate the Blind Value for the candidate action y

        The blind value is the minimum, over the explored actions ``a``, of
        ``rho * distance(a.action, y) + UCB[a]``.

        Parameters
        ----------
        y : :py:class:`numpy.ndarray`
            The candidate action.
        rho : float
            The standard deviation ratio.
        A : list[:py:class:`ast_toolbox.mcts.AdaptiveStressTesting.ASTAction`]
            The list of the explored AST actions
        UCB : dict
            The dictionary containing the upper confidence bound for each
            explored action in the state node.

        Returns
        ----------
        BV : float
            The blind value.

        Raises
        ------
        ValueError
            If ``A`` is empty (raised by ``min`` on an empty sequence).
        """
        return min(rho * self.getDistance(a.action, y) + UCB[a] for a in A)