
I'm trying to modify Karpathy's code so that it works with a softmax function, so that I can use it for games with more than 2 actions. However, I can't get it to work. Could someone point me in the right direction? Thanks. Below is my attempt. How do I make softmax work with policy gradients?

""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """ 
import numpy as np 
import cPickle as pickle 
import gym 

# hyperparameters 
H = 100 # number of hidden layer neurons 
batch_size = 10 # every how many episodes to do a param update? 
learning_rate = 1e-4 
gamma = 0.9 # discount factor for reward 
decay_rate = 0.9 # decay factor for RMSProp leaky sum of grad^2 
resume = False # resume from previous checkpoint? 
render = False 
num_action = 2 

# model initialization 
D = 6 # input dimensionality (Acrobot-v1 observation size) 
if resume: 
    model = pickle.load(open('save.p', 'rb')) 
else: 
    model = {} 
    model['W1'] = np.random.randn(H,D)/np.sqrt(D) # "Xavier" initialization 
    model['W2'] = np.random.randn(num_action, H)/np.sqrt(H) 

grad_buffer = { k : np.zeros_like(v) for k,v in model.iteritems() } # update buffers that add up gradients over a batch 
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.iteritems() } # rmsprop memory 

def sigmoid(x): 
    return 1.0/(1.0 + np.exp(-x)) # sigmoid "squashing" function to interval [0,1] 

def softmax(w, t = 1.0): 
    e = np.exp(np.array(w)/t) 
    dist = e/np.sum(e) 
    return dist 

def prepro(I): 
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """ 
    I = I[35:195] # crop 
    I = I[::2,::2,0] # downsample by factor of 2 
    I[I == 144] = 0 # erase background (background type 1) 
    I[I == 109] = 0 # erase background (background type 2) 
    I[I != 0] = 1 # everything else (paddles, ball) just set to 1 
    return I.astype(np.float).ravel() 

def discount_rewards(r): 
    """ take 1D float array of rewards and compute discounted reward """ 
    discounted_r = np.zeros_like(r) 
    running_add = 0 
    for t in reversed(xrange(0, r.size)): 
        if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!) 
        running_add = running_add * gamma + r[t] 
        discounted_r[t] = running_add 
    return discounted_r 

def policy_forward(x): 
    h = np.dot(model['W1'], x) 
    h[h<0] = 0 # ReLU nonlinearity 
    logp = np.dot(model['W2'], h) 
    p = softmax(logp) 
    return p, h # return probability distribution over actions, and hidden state 

def policy_backward(eph, epdlogp): 
    """ backward pass. (eph is array of intermediate hidden states) """ 
    # print eph.shape 
    # print epdlogp.shape 
    # print model['W2'].shape 
    # dW2 = np.dot(eph.T, epdlogp).ravel() 
    # dh = np.outer(epdlogp, model['W2']) 
    # dh[eph <= 0] = 0 # backpro prelu 
    # dW1 = np.dot(dh.T, epx) 
    # return {'W1':dW1, 'W2':dW2} 
    dW2 = np.dot(eph.T, epdlogp).T 
    # print dW2.shape 
    dh = np.dot(epdlogp, model['W2']) 
    # print dh.shape 
    dh[eph <= 0] = 0 # backpro prelu 
    dW1 = np.dot(dh.T, epx) 
    return {'W1':dW1, 'W2':dW2} 




env = gym.make("Acrobot-v1") 
observation = env.reset() 
prev_x = None # used in computing the difference frame 
xs,hs,dlogps,drs = [],[],[],[] 
running_reward = None 
reward_sum = 0 
episode_number = 0 
while True: 
    if render: env.render() 

    # preprocess the observation, set input to network to be difference image 
    cur_x = observation 
    x = cur_x - prev_x if prev_x is not None else np.zeros(D) 
    prev_x = cur_x 

    # forward the policy network and sample an action from the returned probability 
    aprob, h = policy_forward(x) 
    action = np.argmax(aprob) 
    if action == 1: 
        action = 2 
    # action = 2 if np.random.uniform() > aprob[1] else 0 
    # print aprob 

    # action = 2 if np.random.uniform() < aprob else 3 # roll the dice! 

    # record various intermediates (needed later for backprop) 
    xs.append(x) # observation 
    hs.append(h) # hidden state 

    # if action == 0: 
    # y = [1,0,0] 
    # elif action == 1: 
    # y = [0,1,0] 
    # else: 
    # y = [0,0,1] 


    y = [1,0] if action == 0 else [0,1] # a "fake label" 

    dlogps.append(aprob-y) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused) 

    # step the environment and get new measurements 
    observation, reward, done, info = env.step(action) 
    reward_sum += reward 

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action) 

    if done: # an episode finished 
        episode_number += 1 

        # stack together all inputs, hidden states, action gradients, and rewards for this episode 
        epx = np.vstack(xs) 
        eph = np.vstack(hs) 
        epdlogp = np.vstack(dlogps) 
        epr = np.vstack(drs) 
        xs,hs,dlogps,drs = [],[],[],[] # reset array memory 

        # compute the discounted reward backwards through time 
        discounted_epr = discount_rewards(epr) 
        # standardize the rewards to be unit normal (helps control the gradient estimator variance) 
        discounted_epr -= np.mean(discounted_epr) 
        discounted_epr /= np.std(discounted_epr) 

        epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.) 
        grad = policy_backward(eph, epdlogp) 
        for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch 

        # perform rmsprop parameter update every batch_size episodes 
        if episode_number % batch_size == 0: 
            for k,v in model.iteritems(): 
                g = grad_buffer[k] # gradient 
                rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2 
                model[k] += learning_rate * g/(np.sqrt(rmsprop_cache[k]) + 1e-5) 
                grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer 

        # boring book-keeping 
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01 
        print 'resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward) 
        if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb')) 
        reward_sum = 0 
        observation = env.reset() # reset env 
        prev_x = None 

While debugging, this code runs into a NaN problem and I can't figure out how to fix it.


While debugging, the code also runs into NaN and I don't know why. – user136266


May I suggest posting this question on Cross Validated? I think you have a better chance of getting help there: http://stats.stackexchange.com/ –


It's not clear what the *Karpathy code* is or where exactly you are stuck. But going by the title of your question, you need to apply the *softmax function to the neuron outputs* and use the *gradient to adjust the weights*. **NaN** comes from a computation reaching infinity (*or a very large number*). Check that you have **normalized the inputs** and **scaled the neuron outputs to -1 to 1**. – SACn

Answer


I think the NaN problem you mention in the comments is caused by your softmax function.

Softmax computes the exponential function, exp(x), which for moderately large values of x easily exceeds the range of a single or double precision float. The exponential overflows to infinity, and the subsequent normalization produces NaN.
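A minimal illustration of the failure mode (the specific logit values here are just an assumption for demonstration):

import numpy as np 

w = np.array([1000.0, 1001.0, 1002.0])  # moderately large logits 
e = np.exp(w)                           # overflows: array([inf, inf, inf]) 
print(e / np.sum(e))                    # inf/inf -> [nan nan nan] 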

Solution

The mathematical form of softmax is:

s[i] = exp(x[i])/(exp(x[0]) + exp(x[1]) + .. + exp(x[n-1])) 

We can divide the numerator and denominator of this expression by an arbitrary value, say exp(a), without affecting the result:

s[i] = (exp(x[i])/exp(a))/((exp(x[0]) + exp(x[1]) + .. + exp(x[n-1]))/exp(a)) 

s[i] = exp(x[i]-a)/(exp(x[0]-a) + exp(x[1]-a) + .. + exp(x[n-1]-a)) 

If we let a = max(x), then all the exponents are zero or negative, so no call to exp will overflow and return NaN.

I don't use Python or numpy, but I imagine you could define softmax something like this:

def softmax(w): 
    a = np.max(w) 
    e = np.exp(np.array(w) - a) 
    dist = e/np.sum(e) 
    return dist
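
As a quick check (using the same illustrative logits as above), the shifted version returns a valid distribution instead of NaN:

print(softmax(np.array([1000.0, 1001.0, 1002.0]))) 
# -> [ 0.09003057  0.24472847  0.66524096] 

If you also want the stochastic behaviour described by the comment in your code ("sample an action from the returned probability"), the action could then be drawn with something like np.random.choice(len(aprob), p=aprob) instead of np.argmax, though that is separate from the NaN fix.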