• 姜96
    Author
  • Python
    Development tool
  • 2KB
    File size
  • zip
    File format
  • 0
    Favorites
  • 10 points
    Download points
  • 1
    Downloads
  • 2021-04-26 15:29
    Upload date
Uses the DDPG algorithm from deep reinforcement learning to learn to play a game, letting the agent learn the optimal policy.
ddpg.zip
  • ddpg.py
    5.8KB
Description
""" Note: This is a updated version from my previous code, for the target network, I use moving average to soft replace target parameters instead using assign function. By doing this, it has 20% speed up on my machine (CPU). Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning. DDPG is Actor Critic based algorithm. Pendulum example. View more on my tutorial page: https://morvanzhou.github.io/tutorials/ Using: tensorflow 1.0 gym 0.8.0 """ import tensorflow as tf import numpy as np import gym import time ##################### hyper parameters #################### MAX_EPISODES = 200 MAX_EP_STEPS = 200 LR_A = 0.001 # learning rate for actor LR_C = 0.002 # learning rate for critic GAMMA = 0.9 # reward discount TAU = 0.01 # soft replacement MEMORY_CAPACITY = 10000 BATCH_SIZE = 32 RENDER = False ENV_NAME = 'Pendulum-v0' ############################### DDPG #################################### class DDPG(object): def __init__(self, a_dim, s_dim, a_bound,): self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32) self.pointer = 0 self.sess = tf.Session() self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound, self.S = tf.placeholder(tf.float32, [None, s_dim], 's') self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') self.R = tf.placeholder(tf.float32, [None, 1], 'r') self.a = self._build_a(self.S,) q = self._build_c(self.S, self.a, ) a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor') c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic') ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) # soft replacement def ema_getter(getter, name, *args, **kwargs): return ema.average(getter(name, *args, **kwargs)) target_update = [ema.apply(a_params), ema.apply(c_params)] # soft update operation a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter) # replaced target parameters q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter) a_loss = - tf.reduce_mean(q) # maximize the q self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params) with tf.control_dependencies(target_update): # soft replacement happened at here q_target = self.R + GAMMA * q_ td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params) self.sess.run(tf.global_variables_initializer()) def choose_action(self, s): return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0] def learn(self): indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE) bt = self.memory[indices, :] bs = bt[:, :self.s_dim] ba = bt[:, self.s_dim: self.s_dim + self.a_dim] br = bt[:, -self.s_dim - 1: -self.s_dim] bs_ = bt[:, -self.s_dim:] self.sess.run(self.atrain, {self.S: bs}) self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_}) def store_transition(self, s, a, r, s_): transition = np.hstack((s, a, [r], s_)) index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory self.memory[index, :] = transition self.pointer += 1 def _build_a(self, s, reuse=None, custom_getter=None): trainable = True if reuse is None else False with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter): net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable) a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable) return tf.multiply(a, self.a_bound, name='scaled_a') def _build_c(self, s, a, reuse=None, custom_getter=None): trainable = 
True if reuse is None else False with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter): n_l1 = 30 w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable) w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable) b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable) net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) print("1111",tf.matmul(s, w1_s)) return tf.layers.dense(net, 1, trainable=trainable) # Q(s,a) ############################### training #################################### env = gym.make(ENV_NAME) env = env.unwrapped env.seed(1) s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] a_bound = env.action_space.high ddpg = DDPG(a_dim, s_dim, a_bound) var = 3 # control exploration t1 = time.time() for i in range(MAX_EPISODES): s = env.reset() ep_reward = 0 for j in range(MAX_EP_STEPS): if RENDER: env.render() # Add exploration noise a = ddpg.choose_action(s) # print("org a is", a) a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration # print("imp a is", a) s_, r, done, info = env.step(a) ddpg.store_transition(s, a, r / 10, s_) if ddpg.pointer > MEMORY_CAPACITY: var *= .9995 # decay the action randomness ddpg.learn() s = s_ ep_reward += r if j == MAX_EP_STEPS - 1: print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, ) # if ep_reward > -300:RENDER = True break print('Running time: ', time.time() - t1)
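For comparison, the docstring above says this version replaces the assign-based soft update of the target networks with a moving average (tf.train.ExponentialMovingAverage). A minimal sketch of that older, assign-based variant is shown below; it assumes the same TF 1.x graph-building style but separate eval/target variable scopes, which the uploaded ddpg.py does not use, so the scope names and variables here are illustrative only.

# Hedged sketch: explicit soft replacement of target-network parameters with
# tf.assign, the alternative the docstring contrasts with the moving-average
# approach. Scope names 'Actor/eval' and 'Actor/target' are hypothetical,
# not taken from the uploaded file.
import tensorflow as tf

TAU = 0.01  # soft-replacement rate, same meaning as the TAU hyper parameter above

ae_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor/eval')
at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')

# theta_target <- (1 - TAU) * theta_target + TAU * theta_eval
soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
                for t, e in zip(at_params, ae_params)]

# run once per learning step, e.g. sess.run(soft_replace) at the start of learn()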
    Related downloads
    • DDPG.zip
      A DDPG example program: an AI agent plays the inverted-pendulum game; simple and easy to follow; TensorFlow.
    • DDPG.rar
      DDPG-related papers and code; a classic reinforcement learning algorithm in artificial intelligence.
    • RLContinuousActionSpace: reinforcement learning with DDPG in continuous state and action spaces
      Reinforcement learning in continuous state and action spaces with DDPG (Deep Deterministic Policy Gradient) and A3C (asynchronous Actor-Critic agents). Note: the environments are arbitrary combinations of randomly populated lookup tables and constants of simulated physical models. 1° DDPG: based on Continuous Control with Deep Reinforcement Learning: : and...
    • ddpg: reinforcement learning DDPG code, following the ideas of the paper
      ddpg An implementation of DDPG (Deep Deterministic Policy Gradient) on Gym-torcs, with TensorFlow. DDPG paper: ://arxiv.org/pdf/1509.02971v2.pdf Author: Kenneth Yu. Install dependencies: tensorflow r1.4, gym_torcs: : How to run: training mode: ...
    • RL-Taxonomy: a loose taxonomy of reinforcement learning algorithms
      A loose taxonomy of reinforcement learning algorithms. I am by no means an expert in this field and am doing this as part of my own learning. Note that there are far more algorithms than are listed here, and I do not even know how to categorize some of them. In any case, PRs with corrections or new suggestions are welcome. Please...
    • A full suite of reinforcement learning algorithms implemented in PyTorch
      Built on the PyTorch framework; implements Q-Learning, Sarsa, DQN, DQN-cnn, DoubleDQN, Hierarchical DQN, PG, A2C, SAC, PPO, DDPG, TD3, and more; runs on both GPU and CPU; supports model saving, resuming training from checkpoints, plotting of test results, etc.; within this framework...
    • Must-read papers on deep reinforcement learning
      Classic introductory papers for deep reinforcement learning, covering DQN, DDPG, A3C/A2C/PPO/ACKTR, etc., as well as imitation learning and several survey articles.
    • ddpg.zip
      A Python implementation of the reinforcement learning algorithm DDPG.
    • snowglobe-src-viewer-2.0.0-r0.tar.gz
      An open-source game engine from abroad; it can host thousands of players and targets social games.