
Why is my Q-learning function not converging on OpenAI MountainCar?

Update 1: I modified the epsilon-greedy policy, because it was taking only a very small number of episodes before epsilon became very small. I have updated the code accordingly.

The new problem is that after sufficient training it should not diverge this much, yet as soon as epsilon becomes small it picks up wrong values and immediately diverges.


I have been working with the OpenAI Gym platform for quite a while now, with the goal of learning more about reinforcement learning. With the help of Stack Overflow user @sajad, I successfully implemented double DQN with prioritized experience replay (PER). On the cart-pole problem it achieved a very good success rate after careful hyperparameter tuning. It is the best algorithm I have learned so far, but no matter what I do I cannot get it to work on the mountain car problem: the reward always stays at -200 per episode. I have gone through my code, and based on various tutorials I believe my memory implementation is correct.

From basic DQN all the way to DQN with PER, none of the algorithms have worked.

It would be helpful if I could get some help debugging the code, or pointers to anything else that might be causing it not to converge.

Here is my implementation; all the parameters have their usual names:

# implemented using sum_tree 

import os 
import random 

import gym 
import numpy as np 
import tensorflow as tf 
from memory import Memory 

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
env = gym.make("MountainCar-v0") 
env.reset() 
model_save_path = "C:/Users/sanka/codes/mountain car openai/mc_save" 


class dqn(object): 
    def __init__(self): 
     self.flag = 0 
     self.batch_size = 64 
     self.episodes = 20000 
     self.input_size = env.observation_space.sample().size 
     self.output_size = env.action_space.n 
     self.gamma = 0.99 
     self.epsilon = 1.0 
     self.step = 0 
     self.learning_rate = 0.0001 
     self.lambda1 = 0.001 
     self.initial_epsilon = self.epsilon 
     self.final_epsilon = 0.01 
     self.weights = {} 
     self.biases = {} 
     self.target_weights = {} 
     self.target_biases = {} 
     self.create_nn() 
     self.create_training_network() 
     self.max_size = 10000 
     self.memory = Memory(size=self.max_size) 
     self.sess = tf.InteractiveSession() 
     self.sess.run(tf.global_variables_initializer()) 
     self.saver = tf.train.Saver() 

    def create_nn(self): 

     s1 = {1: [self.input_size, 30], 2: [30, 100], 3: [100, 30], 4: [30, self.output_size]} 
     s2 = {1: [30], 2: [100], 3: [30], 4: [self.output_size]} 
     for i in s1: 
      self.weights[i] = tf.Variable(tf.truncated_normal(s1[i]), name='w{0}'.format(i)) 
      self.biases[i] = tf.Variable(tf.truncated_normal(s2[i]), name='b{0}'.format(i)) 
      self.target_weights[i] = tf.Variable(tf.truncated_normal(s1[i]), name='tw{0}'.format(i)) 
      self.target_biases[i] = tf.Variable(tf.truncated_normal(s2[i]), name='tb{0}'.format(i)) 

    def feed_forward(self, z): 
     q = tf.nn.relu(tf.matmul(z, self.weights[1]) + self.biases[1]) 
     for i in range(2, len(self.weights), 1): 
      q = tf.nn.relu(tf.matmul(q, self.weights[i]) + self.biases[i]) 
     q = tf.matmul(q, self.weights[len(self.weights)]) + self.biases[len(self.biases)] 
     return q 

    def feed_forward_target(self, z): 
     q = tf.nn.relu(tf.matmul(z, self.target_weights[1]) + self.target_biases[1]) 
     for i in range(2, len(self.weights), 1): 
      q = tf.nn.relu(tf.matmul(q, self.target_weights[i]) + self.target_biases[i]) 
     q = tf.matmul(q, self.target_weights[len(self.weights)]) + self.target_biases[len(self.weights)] 
     return q 

    def create_training_network(self): 
     self.x = tf.placeholder(tf.float32, [None, self.input_size]) 
     self.y = tf.placeholder(tf.float32, [None]) 
     self.a = tf.placeholder(tf.float32, [None, self.output_size]) 
     self.q_value = self.feed_forward(self.x) 
     self.q_value_target = self.feed_forward_target(self.x) 
     self.output = tf.reduce_sum(tf.multiply(self.q_value, self.a), reduction_indices=1) 
     self.action = tf.argmax(self.q_value, 1) 
     self.loss = tf.reduce_mean(tf.square(self.output - self.y)) 
     self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss) 

    def append_to_memory(self, state, action, reward, next_state, done): 
     one_hot_action = np.zeros(self.output_size) 
     one_hot_action[action] = 1.0 
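     # priority for a new transition: |reward| plus a small offset, raised to alpha = 0.6;
     # train() later replaces this with the absolute TD error via memory.update()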
     prob = (abs(reward) + .01) ** 0.6 
     self.memory.append(prob, (state, one_hot_action, reward, next_state, done)) 
     if self.memory.current_size >= self.memory.size: 
      self.step += 1 
      # self.epsilon = self.final_epsilon + (self.initial_epsilon - self.final_epsilon) * np.exp(
      # -self.lambda1 * (self.step/200)) 
      self.epsilon = max(self.initial_epsilon - (self.step/200) * self.lambda1, self.final_epsilon) 
      if (self.flag == 0): 
       print("started training") 
       self.flag = 1 
      self.train() 

    def get_reward(self, q1, q2, reward, done): 
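     # Double DQN target: q1 is the online network's output for next_state (used only to
     # pick the argmax action); q2 is the target network's output (used for its value)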
     if done: 
      return reward 
     else: 
      return reward + self.gamma * q2[np.argmax(q1)] 

    def train(self): 
     index, sample = self.memory.sample(self.batch_size) 
     train_x = [i[0] for i in sample] 
     action = [i[1] for i in sample] 
     reward = [i[2] for i in sample] 
     next_state = [i[3] for i in sample] 
     train_y = [] 
     q = self.sess.run(self.q_value, feed_dict={self.x: np.array(train_x)}) 
     q_1 = self.sess.run(self.q_value, feed_dict={self.x: np.array(next_state)}) 
     q_next = self.sess.run(self.q_value_target, feed_dict={self.x: np.array(next_state)}) 
     for i in range(len(reward)): 
      train_y.append(self.get_reward(q_1[i], q_next[i], reward[i], sample[i][4])) 
     train_y = np.array(train_y) 
     train_x = np.array(train_x) 
     action = np.array(action) 
     self.sess.run(self.optimizer, feed_dict={self.x: train_x, self.y: train_y, self.a: action}) 
     for i in range(self.batch_size): 
      error = abs(np.max(q[i]) - train_y[i]) 
      self.memory.update(index[i], (error + 0.01) ** 0.6) 
      # return loss 

    def copy_variables(self): 
     for i in range(1, len(self.weights) + 1, 1): 
      self.sess.run(self.target_weights[i].assign(self.weights[i])) 
      self.sess.run(self.target_biases[i].assign(self.biases[i])) 

    def save(self): 
     self.saver.save(self.sess, model_save_path) 
     print("model saved") 


def main(): 
    obj = dqn() 
    for e in range(obj.episodes): 
     p = env.reset() 
     for i in range(500): 
      # obj.step += 1 
      ac = obj.sess.run(obj.action, feed_dict={obj.x: np.array([p])})[0] 
      if np.random.rand() < obj.epsilon: 
       ac = random.randint(0, obj.output_size - 1) 

      obs, rew, done, _ = env.step(ac) 
      obj.append_to_memory(p, ac, rew, obs, done) 
      p = obs 
      if done: 
       break 
      if obj.step % 1000 == 0 and obj.flag == 1: 
       obj.copy_variables() 
     # print("episode {0} completed with loss: {1}".format(e, total_loss)) 

     if e % 100 == 0: 
      print("episodes {0} completed".format(e),) 
      av = [] 
      for f in range(10): 
       p = env.reset() 
       r = 0 
       for i in range(200): 
        ac = obj.sess.run(obj.action, feed_dict={obj.x: np.array([p])})[0] 
        p, rew, done, _ = env.step(ac) 
        r += rew 
        if done: 
         break 
       av.append(r) 
      print("average score is {0}".format(np.average(np.array(av)))) 
      obj.save() 


if __name__ == '__main__': 
    main() 
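
To make Update 1 concrete, this is the epsilon schedule in isolation: the linear decay that is currently active in append_to_memory, next to the commented-out exponential decay I used before. This is just a standalone sketch using the same constants as in the class above; step is the counter that increases once per stored transition after the memory has filled up.

import numpy as np

initial_epsilon, final_epsilon = 1.0, 0.01
lambda1 = 0.001

def linear_epsilon(step):
    # the schedule currently used in append_to_memory
    return max(initial_epsilon - (step / 200) * lambda1, final_epsilon)

def exponential_epsilon(step):
    # the commented-out schedule from the earlier version
    return final_epsilon + (initial_epsilon - final_epsilon) * np.exp(-lambda1 * (step / 200))

for step in (0, 50000, 100000, 198000):
    print(step, round(linear_epsilon(step), 3), round(exponential_epsilon(step), 3))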

For reference, here is the implementation of the memory, which lives in a separate module:

import numpy as np 
import random 


class Memory(object): 
    def __init__(self, size): 
     self.size = size 
     self.data = np.zeros(size, dtype=object) 
     self.tree = np.zeros(2 * size - 1, dtype=np.float32) 
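     # binary sum tree: tree[0] holds the total priority, each internal node holds the
     # sum of its children, and the leaves (indices size-1 .. 2*size-2) hold the
     # priorities of the transitions stored in self.data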
     self.current_size = 0 
     self.last = 0 

    def append(self, p, data): 
     self.current_size = min(self.current_size + 1, self.size) 
     cur = self.last + self.size - 1 
     self.update_at_index(cur, p - self.tree[cur]) 
     self.data[self.last] = data 
     self.last += 1 
     if self.last >= self.size: 
      self.last = 0 

    def update(self, index, p): 
     self.update_at_index(index, p - self.tree[index]) 

    def update_at_index(self, index, change): 
     while (index >= 0): 
      self.tree[index] += change 
      index = (index - 1) // 2 

    def get(self, index, s): 
     left = index * 2 + 1 
     if (left >= self.size): 
      return (index, self.data[index + 1 - self.size]) 
     if (self.tree[left] >= s): 
      return self.get(left, s) 
     else: 
      right = left + 1 
      return self.get(right, s - self.tree[left]) 

    def sample(self, n): 
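     # stratified sampling: split the total priority mass tree[0] into n equal segments,
     # draw one value uniformly from each, and walk the tree down to the matching entry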
     av_sum = self.tree[0]/n 
     l = [] 
     m = [] 
     for i in range(n): 
      min_sum = av_sum * i 
      max_sum = av_sum * (i + 1) 
      s = random.uniform(min_sum, max_sum) 
      x = self.get(0, s) 
      l.append(x[0]) 
      m.append(x[1]) 
     return l, m 
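
For reference, this is how the dqn class above drives the memory (a minimal standalone sketch with placeholder transitions; the priority formulas mirror append_to_memory and train):

import numpy as np
from memory import Memory

memory = Memory(size=8)

# store a few dummy (state, one_hot_action, reward, next_state, done) tuples,
# with the same initial priority formula as append_to_memory
for t in range(8):
    transition = (np.zeros(2), np.zeros(3), -1.0, np.zeros(2), False)
    memory.append((abs(-1.0) + 0.01) ** 0.6, transition)

# sample() returns parallel lists of tree indices and transitions
indices, batch = memory.sample(4)
states = [tr[0] for tr in batch]  # e.g. the states of the sampled transitions

# after a training step, the sampled indices get new priorities from the TD error,
# as train() does
for idx in indices:
    memory.update(idx, (0.5 + 0.01) ** 0.6)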

Thanks in advance.

Answer


I worked on a continuous version of mountain car (it is now in OpenAI Gym) and solved it with DDPG. In my experiments I found that if the agent gets no reward in the first few episodes, it learns to do nothing. So this is an exploration problem: maybe you can make it take random actions for some episodes before it starts learning, or find a way to reward exploration (for example, giving a reward for observations I had never seen before helped me a lot).
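
For example, here is a rough sketch of both ideas together (none of these names come from your code; warmup_episodes, state_key and shaped_reward are made up for illustration): take purely random actions for the first episodes, and add a small count-based bonus to the environment reward whenever the agent visits a rarely seen part of the state space, using a simple discretisation of the two MountainCar observation variables.

import random
from collections import defaultdict

import gym
import numpy as np

env = gym.make("MountainCar-v0")
visit_counts = defaultdict(int)
warmup_episodes = 50  # pure random actions before learning starts

def state_key(obs, bins=20):
    # discretise (position, velocity) so visits can be counted
    low, high = env.observation_space.low, env.observation_space.high
    ratio = (obs - low) / (high - low)
    return tuple(np.minimum((ratio * bins).astype(int), bins - 1))

def shaped_reward(obs, reward, bonus=1.0):
    # small bonus for rarely visited states, decaying with the visit count
    key = state_key(obs)
    visit_counts[key] += 1
    return reward + bonus / np.sqrt(visit_counts[key])

for episode in range(warmup_episodes):
    obs = env.reset()
    done = False
    while not done:
        action = random.randint(0, env.action_space.n - 1)
        next_obs, reward, done, _ = env.step(action)
        reward = shaped_reward(next_obs, reward)
        # store (obs, action, reward, next_obs, done) in the replay memory here
        obs = next_obs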


Yes, I do let it explore fully for the first 50 episodes (since my memory size is 10000), and only after that do I start decaying epsilon at an exponential rate and start learning. In my initial attempts with uniform random sampling it did converge a little, but then got stuck again. With PER it has never converged, not even slightly, even after 4000 episodes. I have also tried lowering the rate at which epsilon decays. But I will try your reward approach and see whether it works.


You were right, it was an exploration problem. So I increased the number of steps for which I take random actions. It then started to converge, but after epsilon became very small it started to diverge again. I can't find a solution.