tensorflow - Why doesn't the Q-learning agent converge in OpenAI Mountain Car?
Update 1: I revised the epsilon-greedy policy and the number of episodes it takes before epsilon becomes a small quantity. I have updated the code below.
The new problem: after training the agent should not deviate, but it picks wrong values and instantly diverges as soon as epsilon becomes small.
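The revised epsilon schedule is linear; the sketch below just restates the line from append_to_memory in the code further down (note that self.step only starts increasing once the replay memory is full):

def epsilon_at(step, initial_epsilon=1.0, final_epsilon=0.01, lambda1=0.001):
    # same linear schedule as in append_to_memory: epsilon drops by lambda1
    # every 200 training steps until it reaches final_epsilon
    return max(initial_epsilon - (step / 200) * lambda1, final_epsilon)

# e.g. epsilon_at(0) == 1.0, epsilon_at(100000) == 0.5, epsilon_at(200000) == 0.01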
I have been working on the OpenAI Gym platform for quite some time now, with the goal of learning more about reinforcement learning. I have implemented double deep Q-learning (DQN) with prioritized experience replay (PER), with help from Stack Overflow user @sajad. On the Cart-Pole problem I got a good success rate after careful hyper-parameter tuning. It is by far the best algorithm I have learned so far, but whatever I do, I cannot seem to make it work on the Mountain Car problem: the reward stays at -200 for all episodes. I have gone through the code and various tutorials, and I think the memory implementation is correct.
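To make the update rule explicit, the target computed for each transition is the usual double-DQN one; this sketch just restates get_reward from the code below:

import numpy as np

def double_dqn_target(reward, done, q_online_next, q_target_next, gamma=0.99):
    # the online network selects the greedy action for the next state,
    # the target network evaluates it; terminal transitions use the raw reward
    if done:
        return reward
    best_action = np.argmax(q_online_next)
    return reward + gamma * q_target_next[best_action]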
Neither of the algorithms, basic DQN nor DQN with PER, seems to work.
It would be helpful if you could help with debugging the code, or point out other implementation changes that might be causing it not to converge.
Here is my implementation; the parameters have their usual names:
# implemented using sum_tree
import os
import random

import gym
import numpy as np
import tensorflow as tf

from memory import memory

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

env = gym.make("MountainCar-v0")
env.reset()
model_save_path = "c:/users/sanka/codes/mountain car openai/mc_save"


class dqn(object):
    def __init__(self):
        self.flag = 0
        self.batch_size = 64
        self.episodes = 20000
        self.input_size = env.observation_space.sample().size
        self.output_size = env.action_space.n
        self.gamma = 0.99
        self.epsilon = 1.0
        self.step = 0
        self.learning_rate = 0.0001
        self.lambda1 = 0.001
        self.initial_epsilon = self.epsilon
        self.final_epsilon = 0.01
        self.weights = {}
        self.biases = {}
        self.target_weights = {}
        self.target_biases = {}
        self.create_nn()
        self.create_training_network()
        self.max_size = 10000
        self.memory = memory(size=self.max_size)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def create_nn(self):
        # layer sizes for the online and target networks
        s1 = {1: [self.input_size, 30], 2: [30, 100], 3: [100, 30], 4: [30, self.output_size]}
        s2 = {1: [30], 2: [100], 3: [30], 4: [self.output_size]}
        for i in s1:
            self.weights[i] = tf.Variable(tf.truncated_normal(s1[i]), name='w{0}'.format(i))
            self.biases[i] = tf.Variable(tf.truncated_normal(s2[i]), name='b{0}'.format(i))
            self.target_weights[i] = tf.Variable(tf.truncated_normal(s1[i]), name='tw{0}'.format(i))
            self.target_biases[i] = tf.Variable(tf.truncated_normal(s2[i]), name='tb{0}'.format(i))

    def feed_forward(self, z):
        q = tf.nn.relu(tf.matmul(z, self.weights[1]) + self.biases[1])
        for i in range(2, len(self.weights), 1):
            q = tf.nn.relu(tf.matmul(q, self.weights[i]) + self.biases[i])
        q = tf.matmul(q, self.weights[len(self.weights)]) + self.biases[len(self.biases)]
        return q

    def feed_forward_target(self, z):
        q = tf.nn.relu(tf.matmul(z, self.target_weights[1]) + self.target_biases[1])
        for i in range(2, len(self.weights), 1):
            q = tf.nn.relu(tf.matmul(q, self.target_weights[i]) + self.target_biases[i])
        q = tf.matmul(q, self.target_weights[len(self.weights)]) + self.target_biases[len(self.weights)]
        return q

    def create_training_network(self):
        self.x = tf.placeholder(tf.float32, [None, self.input_size])
        self.y = tf.placeholder(tf.float32, [None])
        self.a = tf.placeholder(tf.float32, [None, self.output_size])
        self.q_value = self.feed_forward(self.x)
        self.q_value_target = self.feed_forward_target(self.x)
        # Q-value of the action actually taken (one-hot mask in self.a)
        self.output = tf.reduce_sum(tf.multiply(self.q_value, self.a), reduction_indices=1)
        self.action = tf.argmax(self.q_value, 1)
        self.loss = tf.reduce_mean(tf.square(self.output - self.y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

    def append_to_memory(self, state, action, reward, next_state, done):
        one_hot_action = np.zeros(self.output_size)
        one_hot_action[action] = 1.0
        prob = (abs(reward) + .01) ** 0.6
        self.memory.append(prob, (state, one_hot_action, reward, next_state, done))
        # training (and epsilon decay) only starts once the replay memory is full
        if self.memory.current_size >= self.memory.size:
            self.step += 1
            # self.epsilon = self.final_epsilon + (self.initial_epsilon - self.final_epsilon) * np.exp(
            #     -self.lambda1 * (self.step / 200))
            self.epsilon = max(self.initial_epsilon - (self.step / 200) * self.lambda1, self.final_epsilon)
            if (self.flag == 0):
                print("started training")
                self.flag = 1
            self.train()

    def get_reward(self, q1, q2, reward, done):
        # double-DQN target: the online network (q1) picks the action,
        # the target network (q2) evaluates it
        if done:
            return reward
        else:
            return reward + self.gamma * q2[np.argmax(q1)]

    def train(self):
        index, sample = self.memory.sample(self.batch_size)
        train_x = [i[0] for i in sample]
        action = [i[1] for i in sample]
        reward = [i[2] for i in sample]
        next_state = [i[3] for i in sample]
        train_y = []
        q = self.sess.run(self.q_value, feed_dict={self.x: np.array(train_x)})
        q_1 = self.sess.run(self.q_value, feed_dict={self.x: np.array(next_state)})
        q_next = self.sess.run(self.q_value_target, feed_dict={self.x: np.array(next_state)})
        for i in range(len(reward)):
            train_y.append(self.get_reward(q_1[i], q_next[i], reward[i], sample[i][4]))
        train_y = np.array(train_y)
        train_x = np.array(train_x)
        action = np.array(action)
        self.sess.run(self.optimizer, feed_dict={self.x: train_x, self.y: train_y, self.a: action})
        # update the priorities of the sampled transitions
        for i in range(self.batch_size):
            error = abs(np.max(q[i]) - train_y[i])
            self.memory.update(index[i], (error + 0.01) ** 0.6)
        # return loss

    def copy_variables(self):
        # copy the online network parameters into the target network
        for i in range(1, len(self.weights) + 1, 1):
            self.sess.run(self.target_weights[i].assign(self.weights[i]))
            self.sess.run(self.target_biases[i].assign(self.biases[i]))

    def save(self):
        self.saver.save(self.sess, model_save_path)
        print("model saved")


def main():
    obj = dqn()
    for e in range(obj.episodes):
        p = env.reset()
        for i in range(500):
            # obj.step += 1
            ac = obj.sess.run(obj.action, feed_dict={obj.x: np.array([p])})[0]
            if np.random.rand() < obj.epsilon:
                ac = random.randint(0, obj.output_size - 1)
            obs, rew, done, _ = env.step(ac)
            obj.append_to_memory(p, ac, rew, obs, done)
            p = obs
            if done:
                break
            if obj.step % 1000 == 0 and obj.flag == 1:
                obj.copy_variables()
        # print("episode {0} completed loss: {1}".format(e, total_loss))
        if e % 100 == 0:
            print("episodes {0} completed".format(e))
            av = []
            for f in range(10):
                p = env.reset()
                r = 0
                for i in range(200):
                    ac = obj.sess.run(obj.action, feed_dict={obj.x: np.array([p])})[0]
                    p, rew, done, _ = env.step(ac)
                    r += rew
                    if done:
                        break
                av.append(r)
            print("average score {0}".format(np.average(np.array(av))))
            obj.save()


if __name__ == '__main__':
    main()
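A saved checkpoint can then presumably be restored for a quick greedy rollout like this (a sketch reusing the dqn class and model_save_path from above, assuming a checkpoint has already been written; it is not part of the training script):

# hypothetical evaluation run: restore the checkpoint and act greedily for one episode
obj = dqn()
obj.saver.restore(obj.sess, model_save_path)
p = env.reset()
total = 0
for _ in range(200):
    ac = obj.sess.run(obj.action, feed_dict={obj.x: np.array([p])})[0]
    p, rew, done, _ = env.step(ac)
    total += rew
    if done:
        break
print("greedy episode return: {0}".format(total))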
For reference, here is the implementation of the memory, written as a separate module:
import numpy as np
import random


class memory(object):
    # proportional prioritized replay memory backed by a sum tree
    def __init__(self, size):
        self.size = size
        self.data = np.zeros(size, dtype=object)
        self.tree = np.zeros(2 * size - 1, dtype=np.float32)
        self.current_size = 0
        self.last = 0

    def append(self, p, data):
        self.current_size = min(self.current_size + 1, self.size)
        cur = self.last + self.size - 1
        self.update_at_index(cur, p - self.tree[cur])
        self.data[self.last] = data
        self.last += 1
        if self.last >= self.size:
            self.last = 0

    def update(self, index, p):
        self.update_at_index(index, p - self.tree[index])

    def update_at_index(self, index, change):
        # propagate the priority change up to the root
        while (index >= 0):
            self.tree[index] += change
            index = (index - 1) // 2

    def get(self, index, s):
        # walk down the tree, choosing the left or right child by cumulative priority
        left = index * 2 + 1
        if (left >= self.size):
            return (index, self.data[index + 1 - self.size])
        if (self.tree[left] >= s):
            return self.get(left, s)
        else:
            right = left + 1
            return self.get(right, s - self.tree[left])

    def sample(self, n):
        # stratified sampling: split the total priority into n segments and draw one sample from each
        av_sum = self.tree[0] / n
        l = []
        m = []
        for i in range(n):
            min_sum = av_sum * i
            max_sum = av_sum * (i + 1)
            s = random.uniform(min_sum, max_sum)
            x = self.get(0, s)
            l.append(x[0])
            m.append(x[1])
        return l, m
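One way to sanity-check the sum-tree sampling in isolation (a minimal sketch, assuming the module above is saved as memory.py) is to fill it with known priorities and check that the sampling frequencies are roughly proportional to them:

from collections import Counter
from memory import memory

mem = memory(size=4)
for name, priority in [("a", 1.0), ("b", 2.0), ("c", 4.0), ("d", 8.0)]:
    mem.append(priority, name)

counts = Counter()
for _ in range(1000):
    indices, items = mem.sample(4)
    counts.update(items)

# if the proportional sampling is correct, the counts should follow roughly 1 : 2 : 4 : 8
print(counts)
# the root of the tree holds the total priority, here 1 + 2 + 4 + 8 = 15
print(mem.tree[0])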
Thanks in advance.
I worked on the continuous version of Mountain Car (the one in OpenAI Gym) and solved it with DDPG, and during my experiments I found that if no reward is achieved during the first few episodes, the agent learns nothing. It is an exploration issue here: maybe take only random actions for some episodes before starting to learn, or find a way to reward exploration (for example, it worked great for me when I gave a reward for never-seen observations).
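As an illustration of the "reward for never-seen observations" idea, here is a minimal sketch for MountainCar-v0 (the 20x20 discretisation and the 0.5 bonus are assumed values for the example, not the settings from my experiments):

import gym
import numpy as np

env = gym.make("MountainCar-v0")
low, high = env.observation_space.low, env.observation_space.high
bins = np.array([20, 20])          # assumed discretisation of (position, velocity)
visited = set()

def shaped_reward(obs, reward, bonus=0.5):
    # add a small bonus the first time a region of the state space is visited
    cell = tuple(((obs - low) / (high - low) * bins).astype(int))
    if cell not in visited:
        visited.add(cell)
        return reward + bonus
    return reward

obs = env.reset()
for _ in range(200):
    obs, reward, done, _ = env.step(env.action_space.sample())
    reward = shaped_reward(obs, reward)   # feed this shaped reward to the replay memory
    if done:
        break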