# Source code for ast_toolbox.mcts.AdaptiveStressTestingBlindValue

import numpy as np

from ast_toolbox.mcts.AdaptiveStressTesting import AdaptiveStressTest
from ast_toolbox.mcts.AdaptiveStressTesting import ASTAction


class AdaptiveStressTestBV(AdaptiveStressTest):
    """The AST wrapper for MCTS using the Blind Value exploration [1]_.

    Instead of drawing a single random action when the tree search asks for a
    new action, a pool of ``self.params.M`` candidate actions is sampled and
    the candidate with the largest blind value is returned.

    Parameters
    ----------
    kwargs :
        Keyword arguments passed to `ast_toolbox.mcts.AdaptiveStressTesting.AdaptiveStressTest`

    References
    ----------
    .. [1] Couetoux, Adrien, Hassen Doghmen, and Olivier Teytaud. "Improving the exploration in upper confidence trees."
     International Conference on Learning and Intelligent Optimization. Springer, Berlin, Heidelberg, 2012.
    """

    def __init__(self, **kwargs):
        super(AdaptiveStressTestBV, self).__init__(**kwargs)

    def explore_action(self, s, tree):
        """Sample an action for the exploration using Blind Value.

        If no action has been explored at the state yet, a single action is
        sampled from the action space. Otherwise ``self.params.M`` candidates
        are sampled and the one with the largest blind value (see
        :py:meth:`getBV`) is returned; ties go to the earliest candidate.

        Parameters
        ----------
        s : :py:class:`ast_toolbox.mcts.AdaptiveStressTesting.ASTState`
            The current state.
        tree : dict
            The searching tree, indexed by state.

        Returns
        -------
        action : :py:class:`ast_toolbox.mcts.AdaptiveStressTesting.ASTAction`
            The sampled action.
        """
        # Keep the state argument intact; work on the tree node under its own name.
        node = tree[s]
        A_explored = node.a.keys()
        action_space = self.env.action_space
        if len(A_explored) == 0:
            return ASTAction(action_space.sample())

        UCB = self.getUCB(node)
        sigma_known = np.std([float(UCB[a]) for a in A_explored])

        # Candidate pool and the spread of its distances to the box center.
        center = (action_space.low + action_space.high) / 2.0
        A_pool = [action_space.sample() for _ in range(self.params.M)]
        if not A_pool:
            # M < 1 leaves nothing to rank; fall back to one plain sample
            # rather than wrapping None in an ASTAction.
            return ASTAction(action_space.sample())
        sigma_pool = np.std([self.getDistance(a, center) for a in A_pool])

        # A pool with zero spread (e.g. M == 1) would make the ratio inf or
        # NaN. With rho = 0 the blind value reduces to the smallest explored
        # UCB, so all candidates tie and the first one is kept.
        if np.isfinite(sigma_pool) and sigma_pool > 0.0:
            rho = sigma_known / sigma_pool
        else:
            rho = 0.0

        # Start from the first candidate so a valid action is returned even
        # if every blind value turns out to be NaN (NaN never compares greater).
        BV_max = -np.inf
        a_best = A_pool[0]
        for y in A_pool:
            BV = self.getBV(y, rho, A_explored, UCB)
            if BV > BV_max:
                BV_max = BV
                a_best = y
        return ASTAction(a_best)

    def getDistance(self, a, b):
        """Get the (L2) distance between two actions.

        Parameters
        ----------
        a : :py:class:`numpy.ndarray`
            The first action.
        b : :py:class:`numpy.ndarray`
            The second action.

        Returns
        -------
        distance : float
            The L2 distance between a and b.
        """
        # asarray is a no-op for arrays and lets plain sequences work as well.
        diff = np.asarray(a) - np.asarray(b)
        return np.sqrt(np.sum(diff**2))

    def getUCB(self, s):
        """Get the upper confidence bound for the expected return of every action that has been explored at the state.

        Each bound is ``q + ec * sqrt(log(s.n) / n)``, where ``q`` and ``n``
        are read from the action's entry in ``s.a`` and ``ec`` is
        ``self.params.ec``.

        Parameters
        ----------
        s : :py:class:`ast_toolbox.MCTSdpw.StateNode`
            The state node in the searching tree

        Returns
        -------
        UCB : dict
            The dictionary containing the upper confidence bound for each explored action in the state node.
        """
        # NOTE(review): assumes s.n >= 1 and every explored action has n > 0;
        # otherwise the bound is inf/NaN -- confirm against the tree search.
        UCB = dict()
        nS = s.n
        for a in s.a.keys():
            UCB[a] = s.a[a].q + self.params.ec * np.sqrt(np.log(nS) / float(s.a[a].n))
        return UCB

    def getBV(self, y, rho, A, UCB):
        """Calculate the Blind Value for the candidate action y.

        The blind value is the minimum over the explored actions ``a`` of
        ``rho * distance(a.action, y) + UCB[a]``.

        Parameters
        ----------
        y : :py:class:`numpy.ndarray`
            The candidate action.
        rho : float
            The standard deviation ratio.
        A : iterable of :py:class:`ast_toolbox.mcts.AdaptiveStressTesting.ASTAction`
            The explored AST actions; must not be empty.
        UCB : dict
            The dictionary containing the upper confidence bound for each explored action in the state node.

        Returns
        -------
        BV : float
            The blind value.

        Raises
        ------
        ValueError
            If ``A`` is empty.
        """
        return min(rho * self.getDistance(a.action, y) + UCB[a] for a in A)