이번 포스팅에서는 Partial Observability 상태에서 Reinforcement Learning 을 하기 위해 RNN 을 사용하는 방법을 다룹니다.
지금까지의 Reinforcement Learning 포스팅에서는 에이전트가 환경의 모든 정보를 가지고 있었습니다. 쉽게 설명하자면 스타크래프트에서 맵 전체를 밝혀 주는 black sheep wall 치트를 사용한 것과 같은 환경을 준 것입니다. 이렇게 에이전트가 최선의 행동을 선택하기 위해 필요한 모든 정보를 에이전트에게 전달하는 환경을 Markov Decision Processes 라 합니다.
반면에 에이전트에게 제한된 정보만을 전달해주는 환경을 Partially Observable Markov Decision Processes 라 합니다. 사실 실제 환경은 대부분 이렇게 환경의 일부만을 관찰할 수 있지요. 이렇게 환경의 일부만 관찰할 수 있을 때에는, 에이전트가 최선의 행동을 선택할 수 있도록 시간의 흐름을 포함한 다양한 정보를 함께 주는 방법으로 문제를 해결할 수 있습니다. 예를 들어 상대방의 진영에서 드랍쉽이 날아가는 걸 관찰한 후 드랍쉽이 시야가 닿지 않는 곳으로 가버렸다면, 드랍쉽이 환경에서 완전히 사라진 것이 아니라 드랍쉽의 속도를 생각하여 언제쯤 내 진영에 도착할지 추측할 수 있을 것입니다.
이런 시간적으로 연속된 정보를 처리하기 위해서는 RNN 방식을 사용하는 것이 효과적일 것입니다. 그리고 RNN 을 사용하는 Q-Network 를 Deep Recurrent Q-Network (DRQN) 라 부릅니다. https://arxiv.org/abs/1507.06527
다음으로 DQN 포스팅에서 experience buffer 를 사용했는데 이 버퍼에 경험을 저장하고 꺼내는 방법을 조정할 필요가 있습니다. 이 포스팅에서는 신경망이 연속된 정보를 처리해야 하기 때문에 버퍼에서 서로 무관한 step 들을 무작위로 꺼내 사용할 수 없습니다. 대신 에피소드를 무작위로 고른 뒤, 그 에피소드 안의 무작위 지점부터 연속된 8 step 의 기록을 붙여서 가져올 것입니다.
마지막으로 이 포스팅에서는 https://arxiv.org/abs/1609.05521 논문에서 사용한 기법을 사용했습니다. 이 논문에서는 에이전트를 학습할 때 모든 gradient 를 back propagation 에 사용하는 대신, 각 trace 를 절반으로 나누어 뒤쪽 절반에서 나온 gradient 만을 사용했습니다. 앞쪽 절반은 RNN 의 상태가 아직 충분히 쌓이지 않은 구간이므로, 이렇게 하면 더 의미있는 데이터만으로 학습시킬 수 있습니다.
self.Q) # In order to only propogate accurate gradients through the network, we will mask the first # half of the losses for each trace as per Lample & Chatlot 2016 self.maskA = tf.zeros([self.batch_size, self.trainLength // 2]) self.maskB = tf.ones([self.batch_size, self.trainLength // 2]) self.mask = tf.concat([self.maskA, self.maskB], 1) self.mask = tf.reshape(self.mask, [-1]) self.loss = tf.reduce_mean(self.td_error * self.mask) self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001) self.updateModel = self.trainer.minimize(self.loss) class experience_buffer(): def __init__(self, buffer_size=1000): self.buffer = [] self.buffer_size = buffer_size def add(self, experience): if len(self.buffer) + 1 >= self.buffer_size: self.buffer[0:(1 + len(self.buffer)) - self.buffer_size] = [] self.buffer.append(experience) def sample(self, batch_size, trace_length): sampled_episodes = random.sample(self.buffer, batch_size) sampledTraces = [] for episode in sampled_episodes: episodeList = list(episode) point = np.random.randint(0, len(episodeList) + 1 - trace_length) sampledTraces.append(episodeList[point:point + trace_length]) sampledTraces = np.array(sampledTraces) return np.reshape(sampledTraces, [batch_size * trace_length, 5]) ### Training the network #Setting the training parameters batch_size = 4 #How many experience traces to use for each training step. trace_length = 8 #How long each experience trace will be when training update_freq = 5 #How often to perform a training step. y = .99 #Discount factor on the target Q-values startE = 1 #Starting chance of random action endE = 0.1 #Final chance of random action anneling_steps = 10000 #How many steps of training to reduce startE to endE. num_episodes = 10000 #How many episodes of game environment to train network with. pre_train_steps = 10000 #How many steps of random actions before training begins. load_model = False #Whether to load a saved model. path = "./drqn" #The path to save our model to. 
h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams. max_epLength = 50 #The max allowed length of our episode. time_per_step = 1 #Length of each step used in gif creation summaryLength = 100 #Number of epidoes to periodically save for analysis tau = 0.001 tf.reset_default_graph() # We define the cells for the primary and target q-networks cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True) cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True) mainQN = Qnetwork(h_size, cell, 'main') targetQN = Qnetwork(h_size, cellT, 'target') init = tf.global_variables_initializer() saver = tf.train.Saver(max_to_keep=5) trainables = tf.trainable_variables() targetOps = updateTargetGraph(trainables, tau) myBuffer = experience_buffer() # Set the rate of random action decrease. e = startE stepDrop = (startE - endE) / anneling_steps # create lists to contain total rewards and steps per episode jList = [] rList = [] total_steps = 0 # Make a path for our model to be saved in. if not os.path.exists(path): os.makedirs(path) ##Write the first line of the master log-file for the Control Center with open('./Center/log.csv', 'w') as myfile: wr = csv.writer(myfile, quoting=csv.QUOTE_ALL) wr.writerow(['Episode', 'Length', 'Reward', 'IMG', 'LOG', 'SAL']) with tf.Session() as sess: if load_model == True: print('Loading Model...') ckpt = tf.train.get_checkpoint_state(path) saver.restore(sess, ckpt.model_checkpoint_path) sess.run(init) updateTarget(targetOps, sess) # Set the target network to be equal to the primary network. 
for i in range(num_episodes): episodeBuffer = [] # Reset environment and get first new observation sP = env.reset() s = processState(sP) d = False rAll = 0 j = 0 state = (np.zeros([1, h_size]), np.zeros([1, h_size])) # Reset the recurrent layer's hidden state # The Q-Network while j < max_epLength: j += 1 # Choose an action by greedily (with e chance of random action) from the Q-network if np.random.rand(1) < e or total_steps < pre_train_steps: state1 = sess.run(mainQN.rnn_state, \ feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1, mainQN.state_in: state, mainQN.batch_size: 1}) a = np.random.randint(0, 4) else: a, state1 = sess.run([mainQN.predict, mainQN.rnn_state], \ feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1, mainQN.state_in: state, mainQN.batch_size: 1}) a = a[0] s1P, r, d = env.step(a) s1 = processState(s1P) total_steps += 1 episodeBuffer.append(np.reshape(np.array([s, a, r, s1, d]), [1, 5])) if total_steps > pre_train_steps: if e > endE: e -= stepDrop if total_steps % (update_freq) == 0: updateTarget(targetOps, sess) # Reset the recurrent layer's hidden state state_train = (np.zeros([batch_size, h_size]), np.zeros([batch_size, h_size])) trainBatch = myBuffer.sample(batch_size, trace_length) # Get a random batch of experiences. # Below we perform the Double-DQN update to the target Q-values Q1 = sess.run(mainQN.predict, feed_dict={ \ mainQN.scalarInput: np.vstack(trainBatch[:, 3] / 255.0), \ mainQN.trainLength: trace_length, mainQN.state_in: state_train, mainQN.batch_size: batch_size}) Q2 = sess.run(targetQN.Qout, feed_dict={ \ targetQN.scalarInput: np.vstack(trainBatch[:, 3] / 255.0), \ targetQN.trainLength: trace_length, targetQN.state_in: state_train, targetQN.batch_size: batch_size}) end_multiplier = -(trainBatch[:, 4] - 1) doubleQ = Q2[range(batch_size * trace_length), Q1] targetQ = trainBatch[:, 2] + (y * doubleQ * end_multiplier) # Update the network with our target values. 
sess.run(mainQN.updateModel, \ feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 0] / 255.0), mainQN.targetQ: targetQ, \ mainQN.actions: trainBatch[:, 1], mainQN.trainLength: trace_length, \ mainQN.state_in: state_train, mainQN.batch_size: batch_size}) rAll += r s = s1 sP = s1P state = state1 if d == True: break # Add the episode to the experience buffer bufferArray = np.array(episodeBuffer) episodeBuffer = zip(bufferArray) myBuffer.add(bufferArray) jList.append(j) rList.append(rAll) # Periodically save the model. if i % 1000 == 0 and i != 0: saver.save(sess, path + '/model-' + str(i) + '.cptk') print("Saved Model") if len(rList) % summaryLength == 0 and len(rList) != 0: print(total_steps, np.mean(rList[-summaryLength:]), e) # saveToCenter(i, rList, jList, np.reshape(np.array(episodeBuffer), [sum(1 for _ in episodeBuffer), 5]), \ # summaryLength, h_size, sess, mainQN, time_per_step) saver.save(sess, path + '/model-' + str(i) + '.cptk') ### Testing the network e = 0.01 #The chance of chosing a random action num_episodes = 10000 #How many episodes of game environment to train network with. load_model = True #Whether to load a saved model. path = "./drqn" #The path to save/load our model to/from. h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams. h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams. max_epLength = 50 #The max allowed length of our episode. 
time_per_step = 1 #Length of each step used in gif creation summaryLength = 100 #Number of epidoes to periodically save for analysis tf.reset_default_graph() cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True) cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True) mainQN = Qnetwork(h_size, cell, 'main') targetQN = Qnetwork(h_size, cellT, 'target') init = tf.global_variables_initializer() saver = tf.train.Saver(max_to_keep=2) # create lists to contain total rewards and steps per episode jList = [] rList = [] total_steps = 0 # Make a path for our model to be saved in. if not os.path.exists(path): os.makedirs(path) ##Write the first line of the master log-file for the Control Center with open('./Center/log.csv', 'w') as myfile: wr = csv.writer(myfile, quoting=csv.QUOTE_ALL) wr.writerow(['Episode', 'Length', 'Reward', 'IMG', 'LOG', 'SAL']) # wr = csv.writer(open('./Center/log.csv', 'a'), quoting=csv.QUOTE_ALL) with tf.Session() as sess: if load_model == True: print('Loading Model...') ckpt = tf.train.get_checkpoint_state(path) saver.restore(sess, ckpt.model_checkpoint_path) else: sess.run(init) for i in range(num_episodes): episodeBuffer = [] # Reset environment and get first new observation sP = env.reset() s = processState(sP) d = False rAll = 0 j = 0 state = (np.zeros([1, h_size]), np.zeros([1, h_size])) # The Q-Network while j < max_epLength: # If the agent takes longer than 200 moves to reach either of the blocks, end the trial. 
j += 1 # Choose an action by greedily (with e chance of random action) from the Q-network if np.random.rand(1) < e: state1 = sess.run(mainQN.rnn_state, \ feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1, mainQN.state_in: state, mainQN.batch_size: 1}) a = np.random.randint(0, 4) else: a, state1 = sess.run([mainQN.predict, mainQN.rnn_state], \ feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1, \ mainQN.state_in: state, mainQN.batch_size: 1}) a = a[0] s1P, r, d = env.step(a) s1 = processState(s1P) total_steps += 1 episodeBuffer.append( np.reshape(np.array([s, a, r, s1, d]), [1, 5])) # Save the experience to our episode buffer. rAll += r s = s1 sP = s1P state = state1 if d == True: break bufferArray = np.array(episodeBuffer) jList.append(j) rList.append(rAll) # Periodically save the model. if len(rList) % summaryLength == 0 and len(rList) != 0: print(total_steps, np.mean(rList[-summaryLength:]), e) # saveToCenter(i, rList, jList, np.reshape(np.array(episodeBuffer), [sum(1 for _ in episodeBuffer), 5]), \ # summaryLength, h_size, sess, mainQN, time_per_step) print("Percent of succesful episodes: " + str(sum(rList) / num_episodes) + "%") | cs |
기존 포스팅과 같은 게임 환경을 사용했습니다. 단, Partial Observability 를 적용하도록 파라미터를 전달합니다.
# Create the game environment; partial=True asks it for partial observability
# (per the accompanying text, it is otherwise the same environment as the earlier posts).
env = gameEnv(partial=True, size=9)
CNN 신경망을 구성하는 부분입니다. 이 부분까지는 DQN 포스팅과 동일합니다.
class Qnetwork():
    """Q-network whose first stage is the convolutional stack built here.

    Args:
        h_size: number of output channels of the last convolutional layer.
        rnn_cell: recurrent cell used by the later part of the constructor.
        myScope: prefix for the variable scopes, so that several networks
            can live in the same graph.
    """

    def __init__(self, h_size, rnn_cell, myScope):
        # The network receives a frame from the game, flattened into an array
        # (21168 = 84 * 84 * 3). It then reshapes it into an image and
        # processes it through four convolutional layers.
        self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
        # With 'VALID' padding the spatial size shrinks 84 -> 20 -> 9 -> 7 -> 1,
        # so conv4 yields a 1 x 1 x h_size feature per frame.
        self.conv1 = slim.convolution2d(
            inputs=self.imageIn, num_outputs=32,
            kernel_size=[8, 8], stride=[4, 4], padding='VALID',
            biases_initializer=None, scope=myScope + '_conv1')
        self.conv2 = slim.convolution2d(
            inputs=self.conv1, num_outputs=64,
            kernel_size=[4, 4], stride=[2, 2], padding='VALID',
            biases_initializer=None, scope=myScope + '_conv2')
        self.conv3 = slim.convolution2d(
            inputs=self.conv2, num_outputs=64,
            kernel_size=[3, 3], stride=[1, 1], padding='VALID',
            biases_initializer=None, scope=myScope + '_conv3')
        self.conv4 = slim.convolution2d(
            inputs=self.conv3, num_outputs=h_size,
            kernel_size=[7, 7], stride=[1, 1], padding='VALID',
            biases_initializer=None, scope=myScope + '_conv4')
trainLength 은 학습에 몇 개의 step 을 사용할지를 담아두는 변수입니다.
다음으로 batch_size 변수를 선언하고 conv4 에서 받은 1, 1, 512 배열을 1차원 배열로 펼치고 이를 batch_size, trainLength, h_size 으로 reshape 합니다.
# Continues the body of Qnetwork.__init__ (shown dedented).
# Number of consecutive steps in each trace fed to the recurrent layer.
self.trainLength = tf.placeholder(dtype=tf.int32)
# We take the output from the final convolutional layer and send it to a recurrent layer.
# The input must be reshaped into [batch x trace x units] for rnn processing,
# and then returned to [batch x units] when sent through the upper levels.
self.batch_size = tf.placeholder(dtype=tf.int32)
self.convFlat = tf.reshape(slim.flatten(self.conv4), [self.batch_size, self.trainLength, h_size])
파라미터로 받은 rnn_cell 으로 tf.nn.dynamic_rnn 함수를 이용해 RNN 신경망을 구성합니다.
input 은 CNN 의 output 을 reshape 한 convFlat 으로하고 초기 상태는 zero_state 으로 합니다.
마지막으로 RNN 신경망의 output 을 -1, h_size 배열로 reshape 합니다.
# Continues the body of Qnetwork.__init__ (shown dedented).
# Build the zero state from the cell passed in as `rnn_cell`, not from a
# module-level `cell`, so the network does not depend on a global name and
# the state always matches the cell that dynamic_rnn actually runs.
self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32)
self.rnn, self.rnn_state = tf.nn.dynamic_rnn(
    inputs=self.convFlat, cell=rnn_cell, dtype=tf.float32,
    initial_state=self.state_in, scope=myScope + '_rnn')
# Flatten [batch x trace x units] back to [(batch * trace) x units].
self.rnn = tf.reshape(self.rnn, shape=[-1, h_size])
다음으로 RNN 의 output 을 DQN 포스팅과 마찬가지로 Dueling DQN 방법을 이용해 predict 을 만듭니다.
# Continues the body of Qnetwork.__init__ (shown dedented).
# The output from the recurrent layer is then split into separate Value and Advantage streams
self.streamA, self.streamV = tf.split(self.rnn, 2, 1)
# 4 is the number of actions; the Value stream has a single output.
self.AW = tf.Variable(tf.random_normal([h_size // 2, 4]))
self.VW = tf.Variable(tf.random_normal([h_size // 2, 1]))
self.Advantage = tf.matmul(self.streamA, self.AW)
self.Value = tf.matmul(self.streamV, self.VW)
# Gradient of the advantages with respect to the input image.
self.salience = tf.gradients(self.Advantage, self.imageIn)
# Then combine them together to get our final Q-values.
self.Qout = self.Value + tf.subtract(
    self.Advantage, tf.reduce_mean(self.Advantage, axis=1, keep_dims=True))
self.predict = tf.argmax(self.Qout, 1)
DQN 포스팅과 마찬가지 방법으로 error 을 구합니다.
# Continues the body of Qnetwork.__init__ (shown dedented).
# Below we obtain the per-step error as the squared difference between the
# target and the predicted Q-value of the action that was taken.
self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
self.actions_onehot = tf.one_hot(self.actions, 4, dtype=tf.float32)
# The one-hot product keeps only the Q-value of the chosen action.
self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)
self.td_error = tf.square(self.targetQ - self.Q)
위에서 얻은 error 중 각 trace 의 뒤쪽 절반만 사용하기 위해 maskA 와 maskB 를 만듭니다. maskA 는 0, maskB 는 1로 채운 뒤 둘을 이어 붙여 error 와 곱하면 maskB 와 곱해진 뒷부분만 유효하게 남습니다. 이렇게 만든 loss 를 AdamOptimizer 를 이용하여 최소화하도록 학습합니다.
# Continues the body of Qnetwork.__init__ (shown dedented).
# In order to only propagate accurate gradients through the network, we will mask the first
# half of the losses for each trace as per Lample & Chaplot 2016
self.maskA = tf.zeros([self.batch_size, self.trainLength // 2])
# The second half takes the remaining steps, so the mask always has exactly
# trainLength columns (two `// 2` halves would be one short for odd lengths).
self.maskB = tf.ones([self.batch_size, self.trainLength - self.trainLength // 2])
self.mask = tf.concat([self.maskA, self.maskB], 1)
self.mask = tf.reshape(self.mask, [-1])
self.loss = tf.reduce_mean(self.td_error * self.mask)
self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
self.updateModel = self.trainer.minimize(self.loss)
DQN 포스팅과 같은 experience_buffer 클래스입니다.
다른 부분은 위에서 설명했다시피 연속된 데이터를 가져와야 하기 때문에 sample 함수가 달라졌다는 점입니다. 버퍼에서 batch_size 개의 경험 list (에피소드) 를 무작위로 가져온 후, 각각의 경험 list 에서 무작위 시작 인덱스 point 부터 point 에 trace_length 를 더한 인덱스 직전까지의 연속된 경험을 모두 모아 반환합니다.
class experience_buffer():
    """Bounded store of whole episodes that hands out contiguous traces.

    Each stored item is one episode: a sized sequence of per-step records
    with 5 fields each. Sampling returns runs of consecutive steps so that a
    recurrent network can be trained on them.
    """

    def __init__(self, buffer_size=1000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        """Append one episode, dropping the oldest ones once the buffer is full."""
        overflow = len(self.buffer) + 1 - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]
        self.buffer.append(experience)

    def sample(self, batch_size, trace_length):
        """Return `batch_size` random traces of `trace_length` consecutive steps.

        Episodes shorter than `trace_length` cannot supply a full trace and
        are left out of the draw (previously they made `np.random.randint`
        fail with an unrelated-looking error).

        Returns:
            Array of shape [batch_size * trace_length, 5]; the rows of each
            trace are adjacent and in time order.

        Raises:
            ValueError: if fewer than `batch_size` stored episodes are long
                enough.
        """
        eligible = [ep for ep in self.buffer if len(ep) >= trace_length]
        if len(eligible) < batch_size:
            raise ValueError(
                'need %d episodes with at least %d steps, but only %d are available'
                % (batch_size, trace_length, len(eligible)))
        sampled_episodes = random.sample(eligible, batch_size)
        sampledTraces = []
        for episode in sampled_episodes:
            steps = list(episode)
            # Random start such that the whole trace fits inside the episode.
            point = np.random.randint(0, len(steps) + 1 - trace_length)
            sampledTraces.append(steps[point:point + trace_length])
        return np.reshape(np.array(sampledTraces), [batch_size * trace_length, 5])
DQN 포스팅과 거의 비슷하고 추가된 인자값 중 trace_length 는 학습 시 각 경험의 step 을 얼마나 길게 사용할지에 대한 변수이고 time_per_step 과 summaryLength 는 기록을 위한 인자값입니다.
# Setting the training parameters
batch_size = 4  # How many experience traces to use for each training step.
trace_length = 8  # How long each experience trace will be when training.
update_freq = 5  # How often (in environment steps) to perform a training step.
y = .99  # Discount factor on the target Q-values.
startE = 1  # Starting chance of random action.
endE = 0.1  # Final chance of random action.
anneling_steps = 10000  # How many steps of training to reduce startE to endE.
num_episodes = 10000  # How many episodes of game environment to train network with.
pre_train_steps = 10000  # How many steps of random actions before training begins.
load_model = False  # Whether to load a saved model.
path = "./drqn"  # The path to save our model to.
h_size = 512  # The size of the final convolutional layer before splitting it into Advantage and Value streams.
max_epLength = 50  # The max allowed length of our episode.
time_per_step = 1  # Length of each step used in gif creation.
summaryLength = 100  # Number of episodes to periodically save for analysis.
tau = 0.001  # Passed to updateTargetGraph; presumably the target-network update rate (confirm there).
신경망을 구성하는 부분입니다.
먼저 RNN cell 을 tf.contrib.rnn.BasicLSTMCell 을 이용해 만들고
main, target 신경망을 DQN 포스팅과 같이 각각 만듭니다.
# We define the cells for the primary and target q-networks.
# Each network gets its own LSTM cell and its own scope prefix.
cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
mainQN = Qnetwork(h_size, cell, 'main')
targetQN = Qnetwork(h_size, cellT, 'target')
DQN 포스팅과 동일한 부분입니다.
# Ops that update the target network from the trainable variables.
trainables = tf.trainable_variables()
targetOps = updateTargetGraph(trainables, tau)

myBuffer = experience_buffer()

# Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE) / anneling_steps

# Create lists to contain total rewards and steps per episode.
jList = []
rList = []
total_steps = 0
DQN 포스팅과 동일한 부분입니다.
with tf.Session() as sess:
    sess.run(init)
    updateTarget(targetOps, sess)  # Set the target network to be equal to the primary network.
    for i in range(num_episodes):
        episodeBuffer = []
        # Reset environment and get first new observation
        sP = env.reset()
        s = processState(sP)
        d = False
        rAll = 0  # Total reward collected in this episode.
        j = 0  # Number of steps taken in this episode.
RNN 신경망의 초기화를 위한 state 을 만듭니다. 모두 0으로 초기화하는 방법을 사용합니다.
DQN 포스팅과 마찬가지로 e-greedy 와 pre_train_steps 을 사용합니다.
행동을 선택하는 부분은 DQN 포스팅과 같고,
다른 부분은 mainQN.rnn_state 을 실행시키는 부분입니다.
기존과 다르게 상태값을 255로 나눠 사용합니다. 상태값이 0~255 값이기 때문에 255로 나눠 0~1 사이의 실수값으로 만들 수 있습니다. 그리고 trainLength 와 batch_size 를 1로 하여 rnn_state 을 실행시켜 state1 을 얻습니다.
# Continues the body of the per-episode loop (shown dedented).
# Reset the recurrent layer's hidden state (one all-zero array for each
# half of the LSTM state tuple).
state = (np.zeros([1, h_size]), np.zeros([1, h_size]))
# The Q-Network
while j < max_epLength:
    j += 1
    # Choose an action greedily (with e chance of random action) from the Q-network.
    if np.random.rand(1) < e or total_steps < pre_train_steps:
        # Even for a random action the frame is run through the network so
        # that the recurrent state keeps following the episode.
        state1 = sess.run(
            mainQN.rnn_state,
            feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                       mainQN.state_in: state, mainQN.batch_size: 1})
        a = np.random.randint(0, 4)
    else:
        a, state1 = sess.run(
            [mainQN.predict, mainQN.rnn_state],
            feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                       mainQN.state_in: state, mainQN.batch_size: 1})
        a = a[0]
얻어진 행동으로 환경의 step 을 실행시킵니다. 해당 부분은 DQN 포스팅과 동일합니다.
# Continues the body of the step loop (shown dedented).
s1P, r, d = env.step(a)
s1 = processState(s1P)
total_steps += 1
# One experience row: (state, action, reward, next state, done).
# The fields have different shapes, so the row must be an object array;
# current NumPy refuses to build such a ragged array implicitly.
episodeBuffer.append(np.reshape(np.array([s, a, r, s1, d], dtype=object), [1, 5]))
DQN 포스팅과 동일한 부분입니다.
# Continues the body of the step loop (shown dedented).
if total_steps > pre_train_steps:
    # Anneal the chance of a random action down to endE.
    if e > endE:
        e -= stepDrop
    if total_steps % (update_freq) == 0:
        updateTarget(targetOps, sess)
윗 부분의 RNN 초기 state 와 마찬가지로 state_train 을 만듭니다. 이번에는 배열의 크기가 batch_size, h_size 입니다.
버퍼에서 샘플을 가져와 trainBatch 에 담아
mainQN.predict 와 targetQN.Qout 을 실행시킵니다. 둘 다 input 은 샘플의 다음 상태 (trainBatch 의 3번 열, s1) 입니다.
# Continues the body of the training step (shown dedented).
# Reset the recurrent layer's hidden state
state_train = (np.zeros([batch_size, h_size]), np.zeros([batch_size, h_size]))
trainBatch = myBuffer.sample(batch_size, trace_length)  # Get a random batch of experiences.
# Below we perform the Double-DQN update to the target Q-values.
# Both networks are fed column 3 of the batch, i.e. the next states.
Q1 = sess.run(
    mainQN.predict,
    feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 3] / 255.0),
               mainQN.trainLength: trace_length, mainQN.state_in: state_train,
               mainQN.batch_size: batch_size})
Q2 = sess.run(
    targetQN.Qout,
    feed_dict={targetQN.scalarInput: np.vstack(trainBatch[:, 3] / 255.0),
               targetQN.trainLength: trace_length, targetQN.state_in: state_train,
               targetQN.batch_size: batch_size})
DQN 포스팅과 같이 Double DQN 방법을 이용해 Q-value 을 얻습니다.
# Continues the body of the training step (shown dedented).
# 1 for ordinary steps, 0 where the done flag (column 4) is set.
end_multiplier = -(trainBatch[:, 4] - 1)
# The main network picks the action (Q1); the target network scores it (Q2).
doubleQ = Q2[range(batch_size * trace_length), Q1]
# Reward (column 2) plus the discounted value of the next state.
targetQ = trainBatch[:, 2] + (y * doubleQ * end_multiplier)
얻어진 Q-value 를 이용해 mainQN.updateModel 을 실행시켜 main 신경망을 학습시킵니다.
# Continues the body of the training step (shown dedented).
# Update the network with our target values.
# Column 0 holds the states and column 1 the actions that were taken.
sess.run(
    mainQN.updateModel,
    feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 0] / 255.0),
               mainQN.targetQ: targetQ,
               mainQN.actions: trainBatch[:, 1],
               mainQN.trainLength: trace_length,
               mainQN.state_in: state_train,
               mainQN.batch_size: batch_size})
학습하여 저장한 모델을 가져와 테스트를 해보는 부분입니다. 이 부분에서는 학습을 하지 않습니다.
### Testing the network
e = 0.01  # The chance of choosing a random action.
num_episodes = 10000  # How many episodes of game environment to test the network with.
load_model = True  # Whether to load a saved model.
path = "./drqn"  # The path to save/load our model to/from.
h_size = 512  # The size of the final convolutional layer before splitting it into Advantage and Value streams.
max_epLength = 50  # The max allowed length of our episode.
time_per_step = 1  # Length of each step used in gif creation.
summaryLength = 100  # Number of episodes to periodically save for analysis.

tf.reset_default_graph()
cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size, state_is_tuple=True)
mainQN = Qnetwork(h_size, cell, 'main')
# The target network is not used for acting; presumably it is built so the
# graph has the same variables as the saved checkpoint (confirm).
targetQN = Qnetwork(h_size, cellT, 'target')

init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=2)

# Create lists to contain total rewards and steps per episode.
jList = []
rList = []
total_steps = 0

# Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

# Write the first line of the master log-file for the Control Center.
# The directory is created first so that open() does not fail on a fresh checkout.
if not os.path.exists('./Center'):
    os.makedirs('./Center')
with open('./Center/log.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(['Episode', 'Length', 'Reward', 'IMG', 'LOG', 'SAL'])
    # wr = csv.writer(open('./Center/log.csv', 'a'), quoting=csv.QUOTE_ALL)

with tf.Session() as sess:
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        # get_checkpoint_state returns None when the directory holds no
        # checkpoint; fail with a clear message instead of an AttributeError.
        if ckpt is None or not ckpt.model_checkpoint_path:
            raise IOError('No checkpoint found in ' + path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)

    for i in range(num_episodes):
        episodeBuffer = []
        # Reset environment and get first new observation
        sP = env.reset()
        s = processState(sP)
        d = False
        rAll = 0
        j = 0
        # Reset the recurrent layer's hidden state.
        state = (np.zeros([1, h_size]), np.zeros([1, h_size]))
        # The Q-Network
        while j < max_epLength:  # End the trial after max_epLength moves.
            j += 1
            # Choose an action greedily (with e chance of random action) from the Q-network.
            if np.random.rand(1) < e:
                state1 = sess.run(
                    mainQN.rnn_state,
                    feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                               mainQN.state_in: state, mainQN.batch_size: 1})
                a = np.random.randint(0, 4)
            else:
                a, state1 = sess.run(
                    [mainQN.predict, mainQN.rnn_state],
                    feed_dict={mainQN.scalarInput: [s / 255.0], mainQN.trainLength: 1,
                               mainQN.state_in: state, mainQN.batch_size: 1})
                a = a[0]
            s1P, r, d = env.step(a)
            s1 = processState(s1P)
            total_steps += 1
            # Save the experience to our episode buffer. The fields have
            # different shapes, so the row must be an object array.
            episodeBuffer.append(
                np.reshape(np.array([s, a, r, s1, d], dtype=object), [1, 5]))
            rAll += r
            s = s1
            sP = s1P
            state = state1
            if d:
                break

        jList.append(j)
        rList.append(rAll)

        # Periodically print a summary of the most recent episodes.
        if len(rList) % summaryLength == 0 and len(rList) != 0:
            print(total_steps, np.mean(rList[-summaryLength:]), e)
            # saveToCenter(i, rList, jList, np.reshape(np.array(episodeBuffer), [sum(1 for _ in episodeBuffer), 5]), \
            #     summaryLength, h_size, sess, mainQN, time_per_step)

# NOTE(review): this prints the mean total reward per episode, not a
# percentage in the 0-100 range; confirm the intended metric.
print("Percent of succesful episodes: " + str(sum(rList) / num_episodes) + "%")