This post is about using an RNN to do Reinforcement Learning under Partial Observability.
In the Reinforcement Learning posts so far, the agent was given all of the environment's information. Put simply, it is like playing StarCraft with the map-revealing cheat (black sheep wall) turned on. An environment that hands the agent every piece of information it needs to choose the best action is called a Markov Decision Process.
In contrast, an environment that gives the agent only a limited view of its state is called a Partially Observable Markov Decision Process. In practice, most real environments are like this: only partially observable. When only part of the environment can be observed, the agent can still choose good actions if we provide it with richer information, including information over time. For example, if you see a dropship leave the opponent's base and it then flies out of your vision, the dropship has not disappeared from the environment; from its speed you can estimate roughly when it will reach your base.
An RNN is an effective way to process this kind of temporally ordered information, and a Q-network that uses an RNN is called a Deep Recurrent Q-Network (DRQN): https://arxiv.org/abs/1507.06527
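Before the full code, here is a minimal sketch of the idea (TensorFlow 1.x, with illustrative shapes and names; the actual network used in this post is the Qnetwork class further below): per-frame features produced by a CNN are grouped into traces, passed through an LSTM, and the LSTM outputs are mapped to Q-values.

import tensorflow as tf

# features from the CNN, grouped into traces: [batch, trace, feat_size]
features = tf.placeholder(tf.float32, [None, 8, 512])
lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=512)
rnn_out, rnn_state = tf.nn.dynamic_rnn(lstm_cell, features, dtype=tf.float32)
rnn_out = tf.reshape(rnn_out, [-1, 512])      # back to [batch * trace, units]
q_values = tf.layers.dense(rnn_out, 4)        # one Q-value per action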
Next, the DQN post used an experience buffer, and the way experiences are stored in and sampled from it needs to be adjusted. Since we want the network to process sequences here, we cannot pull isolated random samples from the buffer. Instead, we still sample at random, but each sample is a contiguous trace of 8 steps taken from a randomly chosen episode.
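As a minimal sketch of this sampling scheme (illustrative function and variable names; the actual experience_buffer class appears in the code below):

import numpy as np
import random

def sample_traces(episodes, batch_size, trace_length=8):
    # draw batch_size random episodes, and from each take one contiguous trace of trace_length steps
    traces = []
    for episode in random.sample(episodes, batch_size):
        start = np.random.randint(0, len(episode) + 1 - trace_length)
        traces.append(episode[start:start + trace_length])
    return traces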
Finally, this post also uses a technique from https://arxiv.org/abs/1609.05521. When training the agent, instead of backpropagating the error from every step of a trace, the loss is split in half and only the second half is used. Because the recurrent state is zeroed at the start of each sampled trace, the errors from the early steps are less accurate, so this lets the network learn only from the more meaningful part of each trace.
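In code, this comes down to multiplying the per-step TD error by a mask of zeros for the first half of each trace and ones for the second half, roughly as follows (a sketch that mirrors the mask built inside the Qnetwork class below, with batch_size=4 and trace_length=8 filled in):

import tensorflow as tf

mask = tf.concat([tf.zeros([4, 8 // 2]), tf.ones([4, 8 // 2])], 1)
mask = tf.reshape(mask, [-1])             # [0,0,0,0,1,1,1,1] repeated for each of the 4 traces
# loss = tf.reduce_mean(td_error * mask)  # only the second half of each trace contributes to the loss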
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
import csv
import itertools
import tensorflow.contrib.slim as slim
from helper import *
from gridworld import gameEnv

env = gameEnv(partial=True, size=9)


class Qnetwork():
    def __init__(self, h_size, rnn_cell, myScope):
        # The network receives a frame from the game, flattened into an array.
        # It then resizes it and processes it through four convolutional layers.
        self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
        self.conv1 = slim.convolution2d( \
            inputs=self.imageIn, num_outputs=32, \
            kernel_size=[8, 8], stride=[4, 4], padding='VALID', \
            biases_initializer=None, scope=myScope + '_conv1')
        self.conv2 = slim.convolution2d( \
            inputs=self.conv1, num_outputs=64, \
            kernel_size=[4, 4], stride=[2, 2], padding='VALID', \
            biases_initializer=None, scope=myScope + '_conv2')
        self.conv3 = slim.convolution2d( \
            inputs=self.conv2, num_outputs=64, \
            kernel_size=[3, 3], stride=[1, 1], padding='VALID', \
            biases_initializer=None, scope=myScope + '_conv3')
        self.conv4 = slim.convolution2d( \
            inputs=self.conv3, num_outputs=h_size, \
            kernel_size=[7, 7], stride=[1, 1], padding='VALID', \
            biases_initializer=None, scope=myScope + '_conv4')

        self.trainLength = tf.placeholder(dtype=tf.int32)
        # We take the output from the final convolutional layer and send it to a recurrent layer.
        # The input must be reshaped into [batch x trace x units] for rnn processing,
        # and then returned to [batch x units] when sent through the upper levels.
        self.batch_size = tf.placeholder(dtype=tf.int32)
        self.convFlat = tf.reshape(slim.flatten(self.conv4), [self.batch_size, self.trainLength, h_size])
        self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32)
        self.rnn, self.rnn_state = tf.nn.dynamic_rnn( \
            inputs=self.convFlat, cell=rnn_cell, dtype=tf.float32,
            initial_state=self.state_in, scope=myScope + '_rnn')
        self.rnn = tf.reshape(self.rnn, shape=[-1, h_size])
        # The output from the recurrent layer is then split into separate Value and Advantage streams
        self.streamA, self.streamV = tf.split(self.rnn, 2, 1)
        self.AW = tf.Variable(tf.random_normal([h_size // 2, 4]))
        self.VW = tf.Variable(tf.random_normal([h_size // 2, 1]))
        self.Advantage = tf.matmul(self.streamA, self.AW)
        self.Value = tf.matmul(self.streamV, self.VW)

        self.salience = tf.gradients(self.Advantage, self.imageIn)
        # Then combine them together to get our final Q-values.
        self.Qout = self.Value + tf.subtract(self.Advantage, tf.reduce_mean(self.Advantage, axis=1, keep_dims=True))
        self.predict = tf.argmax(self.Qout, 1)

        # Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, 4, dtype=tf.float32)

        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)

        self.td_error = tf.square(self.targetQ - self.Q)

        # In order to only propagate accurate gradients through the network, we will mask the first
        # half of the losses for each trace as per Lample & Chaplot 2016
        self.maskA = tf.zeros([self.batch_size, self.trainLength // 2])
        self.maskB = tf.ones([self.batch_size, self.trainLength // 2])
        self.mask = tf.concat([self.maskA, self.maskB], 1)
        self.mask = tf.reshape(self.mask, [-1])
        self.loss = tf.reduce_mean(self.td_error * self.mask)

        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)


class experience_buffer():
    def __init__(self, buffer_size=1000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        if len(self.buffer) + 1 >= self.buffer_size:
            self.buffer[0:(1 + len(self.buffer)) - self.buffer_size] = []
        self.buffer.append(experience)

    def sample(self, batch_size, trace_length):
        sampled_episodes = random.sample(self.buffer, batch_size)
        sampledTraces = []
        for episode in sampled_episodes:
            episodeList = list(episode)
            point = np.random.randint(0, len(episodeList) + 1 - trace_length)
            sampledTraces.append(episodeList[point:point + trace_length])
        sampledTraces = np.array(sampledTraces)
        return np.reshape(sampledTraces, [batch_size * trace_length, 5])


### Training the network
# Setting the training parameters
batch_size = 4  # How many experience traces to use for each training step.
trace_length = 8  # How long each experience trace will be when training
update_freq = 5  # How often to perform a training step.
y = .99  # Discount factor on the target Q-values
startE = 1  # Starting chance of random action
endE = 0.1  # Final chance of random action
anneling_steps = 10000  # How many steps of training to reduce startE to endE.
num_episodes = 10000  # How many episodes of game environment to train network with.
pre_train_steps = 10000  # How many steps of random actions before training begins.
load_model = False  # Whether to load a saved model.
path = "./drqn"  # The path to save our model to.
h_size = 512  # The size of the final convolutional layer before splitting it into Advantage and Value streams.
max_epLength = 50  # The max allowed length of our episode.
time_per_step = 1  # Length of each step used in gif creation
summaryLength = 100  # Number of episodes to periodically save for analysis
tau = 0.001

tf.reset_default_graph()
# We define the cells for the primary and target q-networks
cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
mainQN = Qnetwork(h_size, cell, 'main')
targetQN = Qnetwork(h_size, cellT, 'target')

init = tf.global_variables_initializer()

saver = tf.train.Saver(max_to_keep=5)

trainables = tf.trainable_variables()

targetOps = updateTargetGraph(trainables, tau)

myBuffer = experience_buffer()

# Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE) / anneling_steps

# create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0

# Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

## Write the first line of the master log-file for the Control Center
with open('./Center/log.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(['Episode', 'Length', 'Reward', 'IMG', 'LOG', 'SAL'])

with tf.Session() as sess:
    if load_model == True:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    sess.run(init)

    updateTarget(targetOps, sess)  # Set the target network to be equal to the primary network.
    for i in range(num_episodes):
        episodeBuffer = []
        # Reset environment and get first new observation
        sP = env.reset()
        s = processState(sP)
        d = False
        rAll = 0
        j = 0
        state = (np.zeros([1, h_size]), np.zeros([1, h_size]))  # Reset the recurrent layer's hidden state
        # The Q-Network
        while j < max_epLength:
            j += 1
            # Choose an action by greedily (with e chance of random action) from the Q-network
            if np.random.rand(1) < e or total_steps < pre_train_steps:
                state1 = sess.run(mainQN.rnn_state, \
                    feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                               mainQN.state_in: state, mainQN.batch_size: 1})
                a = np.random.randint(0, 4)
            else:
                a, state1 = sess.run([mainQN.predict, mainQN.rnn_state], \
                    feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                               mainQN.state_in: state, mainQN.batch_size: 1})
                a = a[0]
            s1P, r, d = env.step(a)
            s1 = processState(s1P)
            total_steps += 1
            episodeBuffer.append(np.reshape(np.array([s, a, r, s1, d]), [1, 5]))
            if total_steps > pre_train_steps:
                if e > endE:
                    e -= stepDrop

                if total_steps % (update_freq) == 0:
                    updateTarget(targetOps, sess)
                    # Reset the recurrent layer's hidden state
                    state_train = (np.zeros([batch_size, h_size]), np.zeros([batch_size, h_size]))

                    trainBatch = myBuffer.sample(batch_size, trace_length)  # Get a random batch of experiences.
                    # Below we perform the Double-DQN update to the target Q-values
                    Q1 = sess.run(mainQN.predict, feed_dict={ \
                        mainQN.scalarInput: np.vstack(trainBatch[:, 3] / 255.0), \
                        mainQN.trainLength: trace_length, mainQN.state_in: state_train,
                        mainQN.batch_size: batch_size})
                    Q2 = sess.run(targetQN.Qout, feed_dict={ \
                        targetQN.scalarInput: np.vstack(trainBatch[:, 3] / 255.0), \
                        targetQN.trainLength: trace_length, targetQN.state_in: state_train,
                        targetQN.batch_size: batch_size})
                    end_multiplier = -(trainBatch[:, 4] - 1)
                    doubleQ = Q2[range(batch_size * trace_length), Q1]
                    targetQ = trainBatch[:, 2] + (y * doubleQ * end_multiplier)
                    # Update the network with our target values.
                    sess.run(mainQN.updateModel, \
                        feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 0] / 255.0),
                                   mainQN.targetQ: targetQ, \
                                   mainQN.actions: trainBatch[:, 1], mainQN.trainLength: trace_length, \
                                   mainQN.state_in: state_train, mainQN.batch_size: batch_size})
            rAll += r
            s = s1
            sP = s1P
            state = state1
            if d == True:
                break

        # Add the episode to the experience buffer
        bufferArray = np.array(episodeBuffer)
        episodeBuffer = zip(bufferArray)
        myBuffer.add(bufferArray)
        jList.append(j)
        rList.append(rAll)

        # Periodically save the model.
        if i % 1000 == 0 and i != 0:
            saver.save(sess, path + '/model-' + str(i) + '.cptk')
            print("Saved Model")
        if len(rList) % summaryLength == 0 and len(rList) != 0:
            print(total_steps, np.mean(rList[-summaryLength:]), e)
            # saveToCenter(i, rList, jList, np.reshape(np.array(episodeBuffer), [sum(1 for _ in episodeBuffer), 5]), \
            #              summaryLength, h_size, sess, mainQN, time_per_step)
    saver.save(sess, path + '/model-' + str(i) + '.cptk')


### Testing the network
e = 0.01  # The chance of choosing a random action
num_episodes = 10000  # How many episodes of game environment to train network with.
load_model = True  # Whether to load a saved model.
path = "./drqn"  # The path to save/load our model to/from.
h_size = 512  # The size of the final convolutional layer before splitting it into Advantage and Value streams.
max_epLength = 50  # The max allowed length of our episode.
time_per_step = 1  # Length of each step used in gif creation
summaryLength = 100  # Number of episodes to periodically save for analysis

tf.reset_default_graph()
cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
mainQN = Qnetwork(h_size, cell, 'main')
targetQN = Qnetwork(h_size, cellT, 'target')

init = tf.global_variables_initializer()

saver = tf.train.Saver(max_to_keep=2)

# create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0

# Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

## Write the first line of the master log-file for the Control Center
with open('./Center/log.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(['Episode', 'Length', 'Reward', 'IMG', 'LOG', 'SAL'])
    # wr = csv.writer(open('./Center/log.csv', 'a'), quoting=csv.QUOTE_ALL)

with tf.Session() as sess:
    if load_model == True:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)

    for i in range(num_episodes):
        episodeBuffer = []
        # Reset environment and get first new observation
        sP = env.reset()
        s = processState(sP)
        d = False
        rAll = 0
        j = 0
        state = (np.zeros([1, h_size]), np.zeros([1, h_size]))
        # The Q-Network
        while j < max_epLength:  # If the agent takes longer than 200 moves to reach either of the blocks, end the trial.
            j += 1
            # Choose an action by greedily (with e chance of random action) from the Q-network
            if np.random.rand(1) < e:
                state1 = sess.run(mainQN.rnn_state, \
                    feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                               mainQN.state_in: state, mainQN.batch_size: 1})
                a = np.random.randint(0, 4)
            else:
                a, state1 = sess.run([mainQN.predict, mainQN.rnn_state], \
                    feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1, \
                               mainQN.state_in: state, mainQN.batch_size: 1})
                a = a[0]
            s1P, r, d = env.step(a)
            s1 = processState(s1P)
            total_steps += 1
            episodeBuffer.append(np.reshape(np.array([s, a, r, s1, d]), [1, 5]))  # Save the experience to our episode buffer.
            rAll += r
            s = s1
            sP = s1P
            state = state1
            if d == True:
                break

        bufferArray = np.array(episodeBuffer)
        jList.append(j)
        rList.append(rAll)

        # Periodically save the model.
        if len(rList) % summaryLength == 0 and len(rList) != 0:
            print(total_steps, np.mean(rList[-summaryLength:]), e)
            # saveToCenter(i, rList, jList, np.reshape(np.array(episodeBuffer), [sum(1 for _ in episodeBuffer), 5]), \
            #              summaryLength, h_size, sess, mainQN, time_per_step)
print("Percent of successful episodes: " + str(sum(rList) / num_episodes) + "%")
The game environment is the same as in the previous posts, except that we pass a parameter to enable Partial Observability.
env = gameEnv(partial=True, size=9)
This part builds the convolutional network. Up to this point it is identical to the DQN post.
class Qnetwork():
    def __init__(self, h_size, rnn_cell, myScope):
        # The network receives a frame from the game, flattened into an array.
        # It then resizes it and processes it through four convolutional layers.
        self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
        self.conv1 = slim.convolution2d( \
            inputs=self.imageIn, num_outputs=32, \
            kernel_size=[8, 8], stride=[4, 4], padding='VALID', \
            biases_initializer=None, scope=myScope + '_conv1')
        self.conv2 = slim.convolution2d( \
            inputs=self.conv1, num_outputs=64, \
            kernel_size=[4, 4], stride=[2, 2], padding='VALID', \
            biases_initializer=None, scope=myScope + '_conv2')
        self.conv3 = slim.convolution2d( \
            inputs=self.conv2, num_outputs=64, \
            kernel_size=[3, 3], stride=[1, 1], padding='VALID', \
            biases_initializer=None, scope=myScope + '_conv3')
        self.conv4 = slim.convolution2d( \
            inputs=self.conv3, num_outputs=h_size, \
            kernel_size=[7, 7], stride=[1, 1], padding='VALID', \
            biases_initializer=None, scope=myScope + '_conv4')
trainLength is a placeholder holding how many steps (the trace length) are used for training.
Next, we declare the batch_size placeholder, flatten the 1 x 1 x 512 output of conv4 into a 1-D feature vector, and reshape it to [batch_size, trainLength, h_size].
        self.trainLength = tf.placeholder(dtype=tf.int32)
        # We take the output from the final convolutional layer and send it to a recurrent layer.
        # The input must be reshaped into [batch x trace x units] for rnn processing,
        # and then returned to [batch x units] when sent through the upper levels.
        self.batch_size = tf.placeholder(dtype=tf.int32)
        self.convFlat = tf.reshape(slim.flatten(self.conv4), [self.batch_size, self.trainLength, h_size])
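For reference, here is how the shapes work out with the training settings used later in this post (batch_size=4, trace_length=8, h_size=512); this is just the ops above with concrete numbers filled in as an illustration.

# scalarInput : [batch_size * trace_length, 21168] = [32, 21168]  (flattened 84*84*3 frames)
# imageIn     : [32, 84, 84, 3]
# conv4       : [32, 1, 1, 512]   (84 -> 20 -> 9 -> 7 -> 1 after the four VALID convolutions)
# convFlat    : [4, 8, 512]       ([batch, trace, units], ready for the RNN)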
The recurrent layer is built with tf.nn.dynamic_rnn, using the rnn_cell passed in as a constructor argument.
Its input is convFlat, the reshaped CNN output, and its initial state is the cell's zero_state.
Finally, the RNN output is reshaped back to [-1, h_size].
        self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32)
        self.rnn, self.rnn_state = tf.nn.dynamic_rnn( \
            inputs=self.convFlat, cell=rnn_cell, dtype=tf.float32,
            initial_state=self.state_in, scope=myScope + '_rnn')
        self.rnn = tf.reshape(self.rnn, shape=[-1, h_size])
Next, as in the DQN post, the RNN output is fed through the Dueling DQN construction to produce the Q-values and the predict op.
        # The output from the recurrent layer is then split into separate Value and Advantage streams
        self.streamA, self.streamV = tf.split(self.rnn, 2, 1)
        self.AW = tf.Variable(tf.random_normal([h_size // 2, 4]))
        self.VW = tf.Variable(tf.random_normal([h_size // 2, 1]))
        self.Advantage = tf.matmul(self.streamA, self.AW)
        self.Value = tf.matmul(self.streamV, self.VW)

        self.salience = tf.gradients(self.Advantage, self.imageIn)
        # Then combine them together to get our final Q-values.
        self.Qout = self.Value + tf.subtract(self.Advantage, tf.reduce_mean(self.Advantage, axis=1, keep_dims=True))
        self.predict = tf.argmax(self.Qout, 1)
The TD error is computed in the same way as in the DQN post.
        # Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, 4, dtype=tf.float32)

        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)

        self.td_error = tf.square(self.targetQ - self.Q)
To use only the second half of the error obtained above, we create maskA filled with 0s and maskB filled with 1s, concatenate them, and multiply the result with the TD error so that only the second half of each trace (the part multiplied by maskB) remains. The masked error becomes the loss, which is minimized with AdamOptimizer.
        # In order to only propagate accurate gradients through the network, we will mask the first
        # half of the losses for each trace as per Lample & Chaplot 2016
        self.maskA = tf.zeros([self.batch_size, self.trainLength // 2])
        self.maskB = tf.ones([self.batch_size, self.trainLength // 2])
        self.mask = tf.concat([self.maskA, self.maskB], 1)
        self.mask = tf.reshape(self.mask, [-1])
        self.loss = tf.reduce_mean(self.td_error * self.mask)

        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)
This is the experience_buffer class, essentially the same as in the DQN post.
The difference, as explained above, is that we need contiguous data: in the sample method we draw batch_size episodes from the buffer and, from each episode, take every experience from a random start index point up to point + trace_length, collect them, and return them.
class experience_buffer():
    def __init__(self, buffer_size=1000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        if len(self.buffer) + 1 >= self.buffer_size:
            self.buffer[0:(1 + len(self.buffer)) - self.buffer_size] = []
        self.buffer.append(experience)

    def sample(self, batch_size, trace_length):
        sampled_episodes = random.sample(self.buffer, batch_size)
        sampledTraces = []
        for episode in sampled_episodes:
            episodeList = list(episode)
            point = np.random.randint(0, len(episodeList) + 1 - trace_length)
            sampledTraces.append(episodeList[point:point + trace_length])
        sampledTraces = np.array(sampledTraces)
        return np.reshape(sampledTraces, [batch_size * trace_length, 5])
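As a quick illustration of how this buffer is used later in the training loop (the numbers come from the training parameters below):

myBuffer = experience_buffer()
myBuffer.add(np.array(episodeBuffer))   # each buffer entry is one whole episode
trainBatch = myBuffer.sample(4, 8)      # 4 episodes x 8-step contiguous traces
# trainBatch has shape [32, 5]: rows of [s, a, r, s1, d], with each trace kept in order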
This is almost the same as the DQN post. Among the added parameters, trace_length sets how many steps of each experience trace are used for training, while time_per_step and summaryLength are only used for logging.
# Setting the training parameters
batch_size = 4  # How many experience traces to use for each training step.
trace_length = 8  # How long each experience trace will be when training
update_freq = 5  # How often to perform a training step.
y = .99  # Discount factor on the target Q-values
startE = 1  # Starting chance of random action
endE = 0.1  # Final chance of random action
anneling_steps = 10000  # How many steps of training to reduce startE to endE.
num_episodes = 10000  # How many episodes of game environment to train network with.
pre_train_steps = 10000  # How many steps of random actions before training begins.
load_model = False  # Whether to load a saved model.
path = "./drqn"  # The path to save our model to.
h_size = 512  # The size of the final convolutional layer before splitting it into Advantage and Value streams.
max_epLength = 50  # The max allowed length of our episode.
time_per_step = 1  # Length of each step used in gif creation
summaryLength = 100  # Number of episodes to periodically save for analysis
tau = 0.001
This part constructs the networks. First, the RNN cells are created with tf.contrib.rnn.BasicLSTMCell, and then the main and target networks are built, just as in the DQN post.
# We define the cells for the primary and target q-networks
cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
mainQN = Qnetwork(h_size, cell, 'main')
targetQN = Qnetwork(h_size, cellT, 'target')
This part is the same as in the DQN post.
trainables = tf.trainable_variables()

targetOps = updateTargetGraph(trainables, tau)

myBuffer = experience_buffer()

# Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE) / anneling_steps

# create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0
This part is also the same as in the DQN post.
with tf.Session() as sess:
    sess.run(init)

    updateTarget(targetOps, sess)  # Set the target network to be equal to the primary network.
    for i in range(num_episodes):
        episodeBuffer = []
        # Reset environment and get first new observation
        sP = env.reset()
        s = processState(sP)
        d = False
        rAll = 0
        j = 0
We create the state used to initialize the RNN, setting it to all zeros. As in the DQN post, we use e-greedy exploration together with pre_train_steps. Action selection itself is the same as in the DQN post; the difference is that we also run mainQN.rnn_state. Unlike before, the observation is divided by 255: since pixel values range from 0 to 255, dividing by 255 scales them to real values between 0 and 1. We run rnn_state with trainLength and batch_size both set to 1 to obtain state1, the recurrent state carried into the next step.
        state = (np.zeros([1, h_size]), np.zeros([1, h_size]))  # Reset the recurrent layer's hidden state
        # The Q-Network
        while j < max_epLength:
            j += 1
            # Choose an action by greedily (with e chance of random action) from the Q-network
            if np.random.rand(1) < e or total_steps < pre_train_steps:
                state1 = sess.run(mainQN.rnn_state, \
                    feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                               mainQN.state_in: state, mainQN.batch_size: 1})
                a = np.random.randint(0, 4)
            else:
                a, state1 = sess.run([mainQN.predict, mainQN.rnn_state], \
                    feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                               mainQN.state_in: state, mainQN.batch_size: 1})
                a = a[0]
The chosen action is used to step the environment. This part is the same as in the DQN post.
            s1P, r, d = env.step(a)
            s1 = processState(s1P)
            total_steps += 1
            episodeBuffer.append(np.reshape(np.array([s, a, r, s1, d]), [1, 5]))
This part is the same as in the DQN post.
            if total_steps > pre_train_steps:
                if e > endE:
                    e -= stepDrop

                if total_steps % (update_freq) == 0:
                    updateTarget(targetOps, sess)
Just like the initial RNN state above, we create state_train, this time with arrays of shape [batch_size, h_size]. We then sample from the buffer into trainBatch and run mainQN.predict and targetQN.Qout; both take the next states from the sampled traces (trainBatch[:, 3]) as input.
                    # Reset the recurrent layer's hidden state
                    state_train = (np.zeros([batch_size, h_size]), np.zeros([batch_size, h_size]))

                    trainBatch = myBuffer.sample(batch_size, trace_length)  # Get a random batch of experiences.
                    # Below we perform the Double-DQN update to the target Q-values
                    Q1 = sess.run(mainQN.predict, feed_dict={ \
                        mainQN.scalarInput: np.vstack(trainBatch[:, 3] / 255.0), \
                        mainQN.trainLength: trace_length, mainQN.state_in: state_train,
                        mainQN.batch_size: batch_size})
                    Q2 = sess.run(targetQN.Qout, feed_dict={ \
                        targetQN.scalarInput: np.vstack(trainBatch[:, 3] / 255.0), \
                        targetQN.trainLength: trace_length, targetQN.state_in: state_train,
                        targetQN.batch_size: batch_size})
The Q-value targets are then computed with the Double DQN method, as in the DQN post.
                    end_multiplier = -(trainBatch[:, 4] - 1)
                    doubleQ = Q2[range(batch_size * trace_length), Q1]
                    targetQ = trainBatch[:, 2] + (y * doubleQ * end_multiplier)
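To spell out what these three lines compute: trainBatch[:, 4] holds the done flag d, so end_multiplier = 1 - d is 0 at terminal steps and 1 otherwise, which drops the bootstrapped term at episode ends. The target for each step is

targetQ = r + y * Q_target(s1, argmax_a Q_main(s1, a)) * (1 - d)

where the argmax comes from the main network (Q1) and the evaluation from the target network (Q2).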
With the targets in hand, mainQN.updateModel is run to train the main network.
                    # Update the network with our target values.
                    sess.run(mainQN.updateModel, \
                        feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 0] / 255.0),
                                   mainQN.targetQ: targetQ, \
                                   mainQN.actions: trainBatch[:, 1], mainQN.trainLength: trace_length, \
                                   mainQN.state_in: state_train, mainQN.batch_size: batch_size})
This part loads the saved model and tests it; no training is performed here.
### Testing the network
e = 0.01  # The chance of choosing a random action
num_episodes = 10000  # How many episodes of game environment to train network with.
load_model = True  # Whether to load a saved model.
path = "./drqn"  # The path to save/load our model to/from.
h_size = 512  # The size of the final convolutional layer before splitting it into Advantage and Value streams.
max_epLength = 50  # The max allowed length of our episode.
time_per_step = 1  # Length of each step used in gif creation
summaryLength = 100  # Number of episodes to periodically save for analysis

tf.reset_default_graph()
cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
mainQN = Qnetwork(h_size, cell, 'main')
targetQN = Qnetwork(h_size, cellT, 'target')

init = tf.global_variables_initializer()

saver = tf.train.Saver(max_to_keep=2)

# create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0

# Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

## Write the first line of the master log-file for the Control Center
with open('./Center/log.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(['Episode', 'Length', 'Reward', 'IMG', 'LOG', 'SAL'])
    # wr = csv.writer(open('./Center/log.csv', 'a'), quoting=csv.QUOTE_ALL)

with tf.Session() as sess:
    if load_model == True:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)

    for i in range(num_episodes):
        episodeBuffer = []
        # Reset environment and get first new observation
        sP = env.reset()
        s = processState(sP)
        d = False
        rAll = 0
        j = 0
        state = (np.zeros([1, h_size]), np.zeros([1, h_size]))
        # The Q-Network
        while j < max_epLength:  # If the agent takes longer than 200 moves to reach either of the blocks, end the trial.
            j += 1
            # Choose an action by greedily (with e chance of random action) from the Q-network
            if np.random.rand(1) < e:
                state1 = sess.run(mainQN.rnn_state, \
                    feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                               mainQN.state_in: state, mainQN.batch_size: 1})
                a = np.random.randint(0, 4)
            else:
                a, state1 = sess.run([mainQN.predict, mainQN.rnn_state], \
                    feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1, \
                               mainQN.state_in: state, mainQN.batch_size: 1})
                a = a[0]
            s1P, r, d = env.step(a)
            s1 = processState(s1P)
            total_steps += 1
            episodeBuffer.append(np.reshape(np.array([s, a, r, s1, d]), [1, 5]))  # Save the experience to our episode buffer.
            rAll += r
            s = s1
            sP = s1P
            state = state1
            if d == True:
                break

        bufferArray = np.array(episodeBuffer)
        jList.append(j)
        rList.append(rAll)

        # Periodically save the model.
        if len(rList) % summaryLength == 0 and len(rList) != 0:
            print(total_steps, np.mean(rList[-summaryLength:]), e)
            # saveToCenter(i, rList, jList, np.reshape(np.array(episodeBuffer), [sum(1 for _ in episodeBuffer), 5]), \
            #              summaryLength, h_size, sess, mainQN, time_per_step)
print("Percent of successful episodes: " + str(sum(rList) / num_episodes) + "%")