From eb0b57d7fad541e59c9a0fce09ec1a9a6e11ec4b Mon Sep 17 00:00:00 2001
From: Gal Leibovich
Date: Tue, 24 Oct 2017 16:57:44 +0300
Subject: [PATCH] Updating PPO references per issue #11

---
 README.md                                       | 4 ++--
 agents/ppo_agent.py                             | 2 +-
 docs/docs/algorithms/policy_optimization/ppo.md | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f8083fd..866ff5c 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,7 @@ Framework documentation, algoritmic description and instructions on how to contr
 
 ## Parallelizing an Algorithm
 
-Since the introduction of [A3C](https://arxiv.org/abs/1602.01783) in 2016, many algorithms were shown to benefit from running multiple instances in parallel, on many CPU cores. So far, these algorithms include [A3C](https://arxiv.org/abs/1602.01783), [DDPG](https://arxiv.org/pdf/1704.03073.pdf), [PPO](https://arxiv.org/abs/1707.02286), and [NAF](https://arxiv.org/pdf/1610.00633.pdf), and this is most probably only the begining.
+Since the introduction of [A3C](https://arxiv.org/abs/1602.01783) in 2016, many algorithms were shown to benefit from running multiple instances in parallel, on many CPU cores. So far, these algorithms include [A3C](https://arxiv.org/abs/1602.01783), [DDPG](https://arxiv.org/pdf/1704.03073.pdf), [PPO](https://arxiv.org/pdf/1707.06347.pdf), and [NAF](https://arxiv.org/pdf/1610.00633.pdf), and this is most probably only the begining.
 
 Parallelizing an algorithm using Coach is straight-forward.
 
@@ -203,7 +203,7 @@ python3 coach.py -p Hopper_A3C -n 16
 * [Policy Gradients (PG)](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) | **Distributed**
 * [Actor Critic / A3C](https://arxiv.org/abs/1602.01783) | **Distributed**
 * [Deep Deterministic Policy Gradients (DDPG)](https://arxiv.org/abs/1509.02971) | **Distributed**
-* [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.02286.pdf)
+* [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf)
 * [Clipped Proximal Policy Optimization](https://arxiv.org/pdf/1707.06347.pdf) | **Distributed**
 * [Direct Future Prediction (DFP)](https://arxiv.org/abs/1611.01779) | **Distributed**
 
diff --git a/agents/ppo_agent.py b/agents/ppo_agent.py
index ee1c84f..3ad2481 100644
--- a/agents/ppo_agent.py
+++ b/agents/ppo_agent.py
@@ -19,7 +19,7 @@
 from random import shuffle
 import tensorflow as tf
 
-# Proximal Policy Optimization - https://arxiv.org/pdf/1707.02286.pdf
+# Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
 class PPOAgent(ActorCriticAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
         ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
diff --git a/docs/docs/algorithms/policy_optimization/ppo.md b/docs/docs/algorithms/policy_optimization/ppo.md
index a4a4b97..10b6dab 100644
--- a/docs/docs/algorithms/policy_optimization/ppo.md
+++ b/docs/docs/algorithms/policy_optimization/ppo.md
@@ -2,7 +2,7 @@
 
 **Actions space:** Discrete|Continuous
 
-**References:** [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf)
+**References:** [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf)
 
 ## Network Structure
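
For context on the newly referenced paper, the following is a minimal, framework-agnostic NumPy sketch of the clipped surrogate objective described in [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf). It is illustrative only and does not reproduce Coach's `PPOAgent` implementation; the function and variable names below are hypothetical.

```python
# Illustrative sketch only -- NOT Coach's PPOAgent code.
# Computes L^CLIP = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)],
# where r_t is the probability ratio between the new and old policies
# (see https://arxiv.org/pdf/1707.06347.pdf).
import numpy as np

def clipped_surrogate_objective(new_log_probs, old_log_probs, advantages, clip_epsilon=0.2):
    ratios = np.exp(new_log_probs - old_log_probs)        # r_t = pi_new(a|s) / pi_old(a|s)
    unclipped = ratios * advantages                       # r_t * A_t
    clipped = np.clip(ratios, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    return np.mean(np.minimum(unclipped, clipped))        # objective to be maximized

if __name__ == "__main__":
    # Tiny usage example with made-up numbers.
    new_lp = np.array([-0.9, -1.2, -0.3])
    old_lp = np.array([-1.0, -1.0, -0.5])
    adv = np.array([0.5, -0.2, 1.0])
    print(clipped_surrogate_objective(new_lp, old_lp, adv))
```

The `clip_epsilon=0.2` default mirrors the value used in the paper's continuous-control experiments.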