Full code: https://github.com/zle1992/Reinforcement_Learning_Game
Policy Gradient can output an action directly, including continuous actions, but it cannot update at every single step.
Q-Learning first predicts Q values and then picks an action from them; it cannot handle continuous actions or very large action spaces, but it can update at every single step.
Actor Critic in one sentence:
a method that combines Policy Gradient (the Actor) with Function Approximation (the Critic). The Actor picks actions according to probabilities, the Critic scores the actions taken by the Actor, and the Actor then adjusts its action probabilities according to the Critic's score.
Advantage of Actor Critic: it can update at every single step, which is faster than vanilla Policy Gradient.
Disadvantage of Actor Critic: it depends on the Critic's value estimates, but the Critic is hard to get to converge, and updating the Actor at the same time makes convergence even harder. To address this, Google DeepMind proposed an upgraded Actor Critic, Deep Deterministic Policy Gradient (DDPG), which incorporates the strengths of DQN and eases the convergence problem.
The difference between Actor Critic and Policy Gradient:
The gradient used in Policy Gradient is
grad[logPi(s,a) * v_t]
where v_t is the real reward signal: the discounted return accumulated from the state, action and reward recorded at every step of an episode.
In Actor Critic, v_t is replaced by td_error, which is estimated by the Critic and is therefore not necessarily accurate.
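To make the contrast concrete, here is a small numpy sketch; the episode rewards and the constant stand-in critic are made-up toy values, not anything from the repo:

import numpy as np

gamma = 0.9
rewards = np.array([1.0, 1.0, 1.0])       # rewards recorded over one toy episode

# Policy Gradient: v_t is the discounted return computed backwards from the recorded rewards
v_t = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    v_t[t] = running                      # v_t = [2.71, 1.9, 1.0]

# Actor Critic: the weight is the TD error, built from the Critic's value estimates instead
V = lambda state: 0.5                     # stand-in critic that returns a constant value
s, s_next, r = 0, 1, 1.0
td_error = r + gamma * V(s_next) - V(s)   # 1 + 0.45 - 0.5 = 0.95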
The difference between Actor Critic and DQN:
In DQN, the evaluation network and the action network are actually the same network; it simply uses the TD method, with a delayed (target) copy of the network to evaluate the current action.
Actor-Critic solves for the policy while using a value function as an aid: the estimated value function replaces the sampled reward, which improves sample efficiency.
Q-learning is a reinforcement learning method based on value-function estimation, while Policy Gradient is a policy-search reinforcement learning method.
The Critic estimates td_error with the Bellman equation, just like DQN.
Bellman equations:
The Critic uses the Bellman equation for the V function to obtain the TD error, and is trained by gradient descent on its square:
td_error = r + gamma * V(s_) - V(s)
Q-learning uses the Bellman equation for the Q function to update the Q values:
q_target = r + gamma * max_a' Q(s_next, a')
q_eval = Q(s, a)
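A small numpy sketch of the two updates side by side; the V values and the Q table are made-up toy numbers, not anything from the repo:

import numpy as np

gamma = 0.9
r = 1.0

# Critic: the Bellman equation for V gives the TD error (V values are made up)
V_s, V_s_next = 0.5, 0.6
td_error = r + gamma * V_s_next - V_s          # 1 + 0.54 - 0.5 = 1.04
loss_critic = td_error ** 2                    # the Critic minimizes this squared error

# Q-learning: the Bellman equation for Q gives the update target (Q table is made up)
Q = np.array([[0.2, 0.4],                      # Q(s, .)
              [0.1, 0.7]])                     # Q(s_next, .)
s, a, s_next = 0, 1, 1
q_target = r + gamma * Q[s_next].max()         # 1 + 0.9 * 0.7 = 1.63
q_eval = Q[s, a]                               # 0.4
Q[s, a] += 0.1 * (q_target - q_eval)           # move Q(s, a) toward the target (lr = 0.1)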
Actor network: its inputs are (s_t, a_t, td_error).
The Actor network is essentially the same multi-class classification network as in policy gradient. When computing the loss, policy gradient multiplies by a weight v_t, which is the return accumulated from the recorded rewards R.
In the Actor, the loss is weighted by td_error instead, and td_error is computed by the Critic network.
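A minimal TF1-style sketch of that weighting; the single dense layer and the placeholder names are illustrative, not the repo's actual network:

import tensorflow as tf

n_features, n_actions, lr = 4, 2, 0.01

s = tf.placeholder(tf.float32, [None, n_features], name='s')
a = tf.placeholder(tf.int32, [None], name='a')                   # actions actually taken
td_error = tf.placeholder(tf.float32, [None], name='td_error')   # weight supplied by the Critic

logits = tf.layers.dense(s, n_actions)   # a one-layer actor, purely for illustration
# cross entropy = -log pi(a|s); weighting it by the TD error gives the actor loss
neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=a)
loss_actor = tf.reduce_mean(neg_log_prob * td_error)
train_op_actor = tf.train.AdamOptimizer(lr).minimize(loss_actor)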
Critic network: its inputs are (s_t, v_{t+1}, r) and its output is td_error. At learning time the raw inputs are (s_t, r, s_{t+1}), and v_{t+1} comes from running the value network on s_{t+1}:
V_eval = network(s_t)
V_next = network(s_{t+1})
TD_error = (r + gamma * V_next) - V_eval
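The same piece as a minimal TF1-style sketch (the layer size and helper name are illustrative; the full class follows below):

import tensorflow as tf

n_features, gamma, lr = 4, 0.9, 0.01

s = tf.placeholder(tf.float32, [None, n_features], name='s')
s_next = tf.placeholder(tf.float32, [None, n_features], name='s_next')
r = tf.placeholder(tf.float32, [None], name='r')

def value_net(x, scope):
    # a small V(s) network with one hidden layer, purely for illustration
    with tf.variable_scope(scope):
        h = tf.layers.dense(x, 32, activation=tf.nn.relu)
        return tf.squeeze(tf.layers.dense(h, 1), axis=1)   # shape [batch]

v_eval = value_net(s, 'v')              # V(s_t)
v_next = value_net(s_next, 'v_next')    # V(s_{t+1}), a separate copy as in the class below

# Bellman equation for V: TD_error = (r + gamma * V_next) - V_eval
td_error = r + gamma * tf.stop_gradient(v_next) - v_eval
loss_critic = tf.reduce_mean(tf.square(td_error))
train_op_critic = tf.train.AdamOptimizer(lr).minimize(loss_critic)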
ACNetwork.py
import os
import numpy as np
import tensorflow as tf
from abc import ABCMeta, abstractmethod
np.random.seed(1)
tf.set_random_seed(1)

import logging  # configure logging so debug messages show up on the console
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True
session = tf.Session(config=tfconfig)


class ACNetwork(object):
    """Base Actor-Critic network: subclasses provide the actor and critic graphs."""
    __metaclass__ = ABCMeta

    def __init__(self,
                 n_actions,
                 n_features,
                 learning_rate,
                 memory_size,
                 reward_decay,
                 output_graph,
                 log_dir,
                 model_dir,
                 ):
        super(ACNetwork, self).__init__()

        self.n_actions = n_actions
        self.n_features = n_features
        self.learning_rate = learning_rate
        self.gamma = reward_decay
        self.memory_size = memory_size
        self.output_graph = output_graph
        self.lr = learning_rate

        self.log_dir = log_dir
        self.model_dir = model_dir
        # total learning steps
        self.learn_step_counter = 0

        self.s = tf.placeholder(tf.float32, [None] + self.n_features, name='s')
        self.s_next = tf.placeholder(tf.float32, [None] + self.n_features, name='s_next')
        self.r = tf.placeholder(tf.float32, [None, ], name='r')
        self.a = tf.placeholder(tf.int32, [None, ], name='a')

        with tf.variable_scope('Critic'):
            self.v = self._build_c_net(self.s, scope='v', trainable=True)
            self.v_ = self._build_c_net(self.s_next, scope='v_next', trainable=False)

            # TD error from the Bellman equation for V; the Critic minimizes its square
            self.td_error = self.r + self.gamma * self.v_ - self.v
            self.loss_critic = tf.square(self.td_error)
            with tf.variable_scope('train'):
                self.train_op_critic = tf.train.AdamOptimizer(self.lr).minimize(self.loss_critic)

        with tf.variable_scope('Actor'):
            self.acts_prob = self._build_a_net(self.s, scope='actor_net', trainable=True)
            # cross entropy = -log(prob of the chosen action)
            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.acts_prob,
                                                                      labels=self.a)
            # weight -log pi(a|s) by the TD error, held constant for the actor update
            self.loss_actor = tf.reduce_mean(log_prob * tf.stop_gradient(self.td_error))
            with tf.variable_scope('train'):
                self.train_op_actor = tf.train.AdamOptimizer(self.lr).minimize(self.loss_actor)

        self.sess = tf.Session()
        if self.output_graph:
            tf.summary.FileWriter(self.log_dir, self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self.cost_his = [0]

        self.saver = tf.train.Saver()

        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        checkpoint = tf.train.get_checkpoint_state(self.model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Loading Successfully")
            self.learn_step_counter = int(checkpoint.model_checkpoint_path.split('-')[-1]) + 1

    @abstractmethod
    def _build_a_net(self, x, scope, trainable):
        raise NotImplementedError

    @abstractmethod
    def _build_c_net(self, x, scope, trainable):
        raise NotImplementedError

    def learn(self, data):
        batch_memory_s = data['s']
        batch_memory_a = data['a']
        batch_memory_r = data['r']
        batch_memory_s_ = data['s_']

        # update the Critic first, then the Actor with the resulting TD error
        _, cost = self.sess.run(
            [self.train_op_critic, self.loss_critic],
            feed_dict={
                self.s: batch_memory_s,
                self.a: batch_memory_a,
                self.r: batch_memory_r,
                self.s_next: batch_memory_s_,
            })

        _, cost = self.sess.run(
            [self.train_op_actor, self.loss_actor],
            feed_dict={
                self.s: batch_memory_s,
                self.a: batch_memory_a,
                self.r: batch_memory_r,
                self.s_next: batch_memory_s_,
            })

        self.cost_his.append(cost)

        self.learn_step_counter += 1
        # save the network every 10000 learning steps
        if self.learn_step_counter % 10000 == 0:
            self.saver.save(self.sess, self.model_dir, global_step=self.learn_step_counter)

    def choose_action(self, s):
        s = s[np.newaxis, :]
        probs = self.sess.run(self.acts_prob, feed_dict={self.s: s})
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())
game.py
import sys
import os
import gym
import numpy as np
import tensorflow as tf
sys.path.append('./')
sys.path.append('model')

from util import Memory, StateProcessor
from ACNetwork import ACNetwork
np.random.seed(1)
tf.set_random_seed(1)

import logging  # configure logging so debug messages show up on the console
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True
session = tf.Session(config=tfconfig)


class ACNetwork4CartPole(ACNetwork):
    """Actor-Critic network for CartPole: two small fully connected networks."""
    def __init__(self, **kwargs):
        super(ACNetwork4CartPole, self).__init__(**kwargs)

    def _build_a_net(self, x, scope, trainable):
        w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)

        with tf.variable_scope(scope):
            e1 = tf.layers.dense(inputs=x,
                                 units=32,
                                 bias_initializer=b_initializer,
                                 kernel_initializer=w_initializer,
                                 activation=tf.nn.relu,
                                 trainable=trainable)
            q = tf.layers.dense(inputs=e1,
                                units=self.n_actions,
                                bias_initializer=b_initializer,
                                kernel_initializer=w_initializer,
                                activation=tf.nn.softmax,
                                trainable=trainable)
        return q

    def _build_c_net(self, x, scope, trainable):
        w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)

        with tf.variable_scope(scope):
            e1 = tf.layers.dense(inputs=x,
                                 units=32,
                                 bias_initializer=b_initializer,
                                 kernel_initializer=w_initializer,
                                 activation=tf.nn.relu,
                                 trainable=trainable)
            q = tf.layers.dense(inputs=e1,
                                units=1,
                                bias_initializer=b_initializer,
                                kernel_initializer=w_initializer,
                                activation=None,
                                trainable=trainable)
        return q


batch_size = 32
memory_size = 100
# env = gym.make('Breakout-v0')  # discrete actions
env = gym.make('CartPole-v0')    # discrete actions

n_features = list(env.observation_space.shape)
n_actions = env.action_space.n
env = env.unwrapped


def run():
    RL = ACNetwork4CartPole(
        n_actions=n_actions,
        n_features=n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        memory_size=memory_size,
        output_graph=True,
        log_dir='log/ACNetwork4CartPole/',
        model_dir='model_dir/ACNetwork4CartPole/'
    )

    memory = Memory(n_actions, n_features, memory_size=memory_size)

    step = 0
    ep_r = 0
    for episode in range(2000):
        # initial observation
        observation = env.reset()

        while True:
            # RL chooses an action based on the observation
            action = RL.choose_action(observation)

            # RL takes the action and gets the next observation and reward
            observation_, reward, done, info = env.step(action)

            # reward shaping: the smaller theta and the closer to the center, the better
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2

            memory.store_transition(observation, action, reward, observation_)

            if (step > 200) and (step % 1 == 0):
                data = memory.sample(batch_size)
                RL.learn(data)

            # swap observations
            observation = observation_
            ep_r += reward

            if episode > 700:
                env.render()  # render on the screen

            # break the while loop at the end of this episode
            if done:
                print('step: ', step,
                      'episode: ', episode,
                      'ep_r: ', round(ep_r, 2),
                      'loss: ', RL.cost_his[-1])
                ep_r = 0
                break
            step += 1

    # end of game
    print('game over')
    env.close()


def main():
    run()


if __name__ == '__main__':
    main()