tensorflow - Why doesn't the Q-learning function converge in OpenAI Mountain Car?


Update 1: I revised the epsilon-greedy policy so that it takes more episodes before epsilon is reduced to a small value. I have updated the code accordingly.

The new problem is that after training, when the policy should no longer deviate, it picks the wrong actions and instantly diverges once epsilon becomes small.
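Concretely, the revised schedule is linear (a minimal sketch pulled out of the full code below; the constants are just the values I am currently using):

    # linear epsilon decay used in the updated code: epsilon drops by lambda1
    # every 200 training steps and is floored at final_epsilon
    initial_epsilon, final_epsilon, lambda1 = 1.0, 0.01, 0.001

    def epsilon_at(step):
        return max(initial_epsilon - (step / 200) * lambda1, final_epsilon)

    # with these constants the floor is only reached after
    # (1.0 - 0.01) * 200 / 0.001 = 198,000 training steps
    print(epsilon_at(0), epsilon_at(100000), epsilon_at(198000))   # 1.0 0.5 0.01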


I have been working on the OpenAI Gym platform for quite some time with the goal of learning more about reinforcement learning. I have implemented double deep Q-learning (DQN) with prioritized experience replay (PER), following Stack Overflow user @sajad. On the cart-pole problem I got a good success rate after careful hyper-parameter tuning. It is by far the best algorithm I have learned so far, but whatever I do, I cannot seem to make it work on the mountain car problem: the reward stays at -200 for all episodes. I have looked over my code and various tutorials, and I think the memory implementation is correct.
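To be concrete, the target I compute is the usual double-DQN one: the online network picks the greedy next action and the target network evaluates it. A minimal sketch (the names `q_online_next` and `q_target_next` are just placeholders for the two networks' Q-values of the next state):

    import numpy as np

    def double_dqn_target(reward, done, gamma, q_online_next, q_target_next):
        # terminal transitions do not bootstrap
        if done:
            return reward
        a = np.argmax(q_online_next)               # action chosen by the online network
        return reward + gamma * q_target_next[a]   # evaluated by the target network

    # example: reward -1.0, non-terminal, gamma 0.99
    print(double_dqn_target(-1.0, False, 0.99,
                            q_online_next=np.array([0.1, 0.5, 0.2]),
                            q_target_next=np.array([0.3, 0.4, 0.6])))   # -1.0 + 0.99 * 0.4 = -0.604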

Neither of the algorithms, basic DQN nor DQN with PER, seems to work.

It would be helpful if someone could assist in debugging the code or point out other implementation changes that might be causing it not to converge.

Here is my implementation; the parameters have their usual names:

    # implemented using sum_tree
    import os
    import random

    import gym
    import numpy as np
    import tensorflow as tf
    from memory import Memory

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    env = gym.make("MountainCar-v0")
    env.reset()
    model_save_path = "c:/users/sanka/codes/mountain car openai/mc_save"


    class DQN(object):
        def __init__(self):
            self.flag = 0
            self.batch_size = 64
            self.episodes = 20000
            self.input_size = env.observation_space.sample().size
            self.output_size = env.action_space.n
            self.gamma = 0.99
            self.epsilon = 1.0
            self.step = 0
            self.learning_rate = 0.0001
            self.lambda1 = 0.001
            self.initial_epsilon = self.epsilon
            self.final_epsilon = 0.01
            self.weights = {}
            self.biases = {}
            self.target_weights = {}
            self.target_biases = {}
            self.create_nn()
            self.create_training_network()
            self.max_size = 10000
            self.memory = Memory(size=self.max_size)
            self.sess = tf.InteractiveSession()
            self.sess.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()

        def create_nn(self):
            # four fully connected layers; separate variables for the online and target networks
            s1 = {1: [self.input_size, 30], 2: [30, 100], 3: [100, 30], 4: [30, self.output_size]}
            s2 = {1: [30], 2: [100], 3: [30], 4: [self.output_size]}
            for i in s1:
                self.weights[i] = tf.Variable(tf.truncated_normal(s1[i]), name='w{0}'.format(i))
                self.biases[i] = tf.Variable(tf.truncated_normal(s2[i]), name='b{0}'.format(i))
                self.target_weights[i] = tf.Variable(tf.truncated_normal(s1[i]), name='tw{0}'.format(i))
                self.target_biases[i] = tf.Variable(tf.truncated_normal(s2[i]), name='tb{0}'.format(i))

        def feed_forward(self, z):
            q = tf.nn.relu(tf.matmul(z, self.weights[1]) + self.biases[1])
            for i in range(2, len(self.weights), 1):
                q = tf.nn.relu(tf.matmul(q, self.weights[i]) + self.biases[i])
            q = tf.matmul(q, self.weights[len(self.weights)]) + self.biases[len(self.biases)]
            return q

        def feed_forward_target(self, z):
            q = tf.nn.relu(tf.matmul(z, self.target_weights[1]) + self.target_biases[1])
            for i in range(2, len(self.weights), 1):
                q = tf.nn.relu(tf.matmul(q, self.target_weights[i]) + self.target_biases[i])
            q = tf.matmul(q, self.target_weights[len(self.weights)]) + self.target_biases[len(self.weights)]
            return q

        def create_training_network(self):
            self.x = tf.placeholder(tf.float32, [None, self.input_size])
            self.y = tf.placeholder(tf.float32, [None])
            self.a = tf.placeholder(tf.float32, [None, self.output_size])
            self.q_value = self.feed_forward(self.x)
            self.q_value_target = self.feed_forward_target(self.x)
            self.output = tf.reduce_sum(tf.multiply(self.q_value, self.a), reduction_indices=1)
            self.action = tf.argmax(self.q_value, 1)
            self.loss = tf.reduce_mean(tf.square(self.output - self.y))
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

        def append_to_memory(self, state, action, reward, next_state, done):
            # new transitions get a priority based on |reward|; training starts once the memory is full
            one_hot_action = np.zeros(self.output_size)
            one_hot_action[action] = 1.0
            prob = (abs(reward) + .01) ** 0.6
            self.memory.append(prob, (state, one_hot_action, reward, next_state, done))
            if self.memory.current_size >= self.memory.size:
                self.step += 1
                # self.epsilon = self.final_epsilon + (self.initial_epsilon - self.final_epsilon) * np.exp(
                #     -self.lambda1 * (self.step / 200))
                self.epsilon = max(self.initial_epsilon - (self.step / 200) * self.lambda1, self.final_epsilon)
                if self.flag == 0:
                    print("started training")
                    self.flag = 1
                self.train()

        def get_reward(self, q1, q2, reward, done):
            # double-DQN target: the online net (q1) selects the action, the target net (q2) evaluates it
            if done:
                return reward
            else:
                return reward + self.gamma * q2[np.argmax(q1)]

        def train(self):
            index, sample = self.memory.sample(self.batch_size)
            train_x = [i[0] for i in sample]
            action = [i[1] for i in sample]
            reward = [i[2] for i in sample]
            next_state = [i[3] for i in sample]
            train_y = []
            q = self.sess.run(self.q_value, feed_dict={self.x: np.array(train_x)})
            q_1 = self.sess.run(self.q_value, feed_dict={self.x: np.array(next_state)})
            q_next = self.sess.run(self.q_value_target, feed_dict={self.x: np.array(next_state)})
            for i in range(len(reward)):
                train_y.append(self.get_reward(q_1[i], q_next[i], reward[i], sample[i][4]))
            train_y = np.array(train_y)
            train_x = np.array(train_x)
            action = np.array(action)
            self.sess.run(self.optimizer, feed_dict={self.x: train_x, self.y: train_y, self.a: action})
            # update the priorities of the sampled transitions with their new errors
            for i in range(self.batch_size):
                error = abs(np.max(q[i]) - train_y[i])
                self.memory.update(index[i], (error + 0.01) ** 0.6)
            # return loss

        def copy_variables(self):
            # copy the online network's weights into the target network
            for i in range(1, len(self.weights) + 1, 1):
                self.sess.run(self.target_weights[i].assign(self.weights[i]))
                self.sess.run(self.target_biases[i].assign(self.biases[i]))

        def save(self):
            self.saver.save(self.sess, model_save_path)
            print("model saved")


    def main():
        obj = DQN()
        for e in range(obj.episodes):
            p = env.reset()
            for i in range(500):
                # obj.step += 1
                ac = obj.sess.run(obj.action, feed_dict={obj.x: np.array([p])})[0]
                if np.random.rand() < obj.epsilon:
                    ac = random.randint(0, obj.output_size - 1)
                obs, rew, done, _ = env.step(ac)
                obj.append_to_memory(p, ac, rew, obs, done)
                p = obs
                if done:
                    break
                if obj.step % 1000 == 0 and obj.flag == 1:
                    obj.copy_variables()
            # print("episode {0} completed loss: {1}".format(e, total_loss))

            if e % 100 == 0:
                # evaluate the greedy policy for 10 episodes and save a checkpoint
                print("episodes {0} completed".format(e))
                av = []
                for f in range(10):
                    p = env.reset()
                    r = 0
                    for i in range(200):
                        ac = obj.sess.run(obj.action, feed_dict={obj.x: np.array([p])})[0]
                        p, rew, done, _ = env.step(ac)
                        r += rew
                        if done:
                            break
                    av.append(r)
                print("average score {0}".format(np.average(np.array(av))))
                obj.save()


    if __name__ == '__main__':
        main()

For reference, here is the implementation of the memory, written as a separate module:

    import numpy as np
    import random


    class Memory(object):
        # sum-tree backed replay memory: leaves hold priorities, internal nodes hold sums
        def __init__(self, size):
            self.size = size
            self.data = np.zeros(size, dtype=object)
            self.tree = np.zeros(2 * size - 1, dtype=np.float32)
            self.current_size = 0
            self.last = 0

        def append(self, p, data):
            # overwrite the oldest slot and propagate the new priority up the tree
            self.current_size = min(self.current_size + 1, self.size)
            cur = self.last + self.size - 1
            self.update_at_index(cur, p - self.tree[cur])
            self.data[self.last] = data
            self.last += 1
            if self.last >= self.size:
                self.last = 0

        def update(self, index, p):
            self.update_at_index(index, p - self.tree[index])

        def update_at_index(self, index, change):
            while index >= 0:
                self.tree[index] += change
                index = (index - 1) // 2

        def get(self, index, s):
            # descend the tree according to the cumulative priorities
            left = index * 2 + 1
            if left >= self.size:
                return (index, self.data[index + 1 - self.size])
            if self.tree[left] >= s:
                return self.get(left, s)
            else:
                right = left + 1
                return self.get(right, s - self.tree[left])

        def sample(self, n):
            # stratified sampling: one draw from each of n equal slices of the total priority
            av_sum = self.tree[0] / n
            l = []
            m = []
            for i in range(n):
                min_sum = av_sum * i
                max_sum = av_sum * (i + 1)
                s = random.uniform(min_sum, max_sum)
                x = self.get(0, s)
                l.append(x[0])
                m.append(x[1])
            return l, m
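And this is how the DQN code above interacts with it (a minimal, self-contained sketch with made-up priorities and transitions, assuming the Memory class exactly as posted):

    import numpy as np
    from memory import Memory   # the module posted above

    mem = Memory(size=8)

    # store a few transitions; the priority is (|reward| + 0.01) ** 0.6 as in append_to_memory
    for t in range(8):
        transition = (np.zeros(2), np.array([1.0, 0.0, 0.0]), -1.0, np.zeros(2), False)
        mem.append((abs(-1.0) + 0.01) ** 0.6, transition)

    # sample a mini-batch: returns tree indices (kept for the priority update) and the data
    indices, batch = mem.sample(4)

    # after computing the errors of the batch, write the new priorities back
    for idx, td_error in zip(indices, [0.5, 0.1, 0.3, 0.2]):
        mem.update(idx, (td_error + 0.01) ** 0.6)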

Thanks in advance.

I worked on the continuous version of mountain car (the one in OpenAI Gym) and solved it with DDPG, and during my experiments I found that if no reward is achieved during the first few episodes, it learns nothing. So here you have an exploration issue: maybe take only random actions for some episodes before starting to learn, or find a way to reward exploration (for example, it worked great for me when I gave a reward for never-before-seen observations).
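A minimal sketch of what I mean, assuming the discrete MountainCar-v0 setup from the question; the bin width, bonus size and number of warm-up episodes are arbitrary picks of mine, not tuned values:

    import gym
    import numpy as np

    env = gym.make("MountainCar-v0")
    warmup_episodes = 50      # act purely at random before learning anything
    visited = set()           # coarsely discretized observations seen so far

    def exploration_bonus(obs, bonus=1.0):
        # small extra reward the first time a (rounded) observation is seen
        key = tuple(np.round(obs, 1))
        if key in visited:
            return 0.0
        visited.add(key)
        return bonus

    for episode in range(200):
        obs = env.reset()
        done = False
        while not done:
            if episode < warmup_episodes:
                action = env.action_space.sample()   # pure exploration
            else:
                # replace this with the epsilon-greedy action from the Q-network;
                # random actions are only a stand-in so the sketch runs on its own
                action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            shaped_reward = reward + exploration_bonus(next_obs)
            # append (obs, action, shaped_reward, next_obs, done) to the replay
            # memory here and train only once past the warm-up phase
            obs = next_obs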

