Mirror of https://github.com/gryf/coach.git
network_imporvements branch merge
@@ -31,9 +31,10 @@ from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedderParameters
 from rl_coach.core_types import EnvironmentSteps, Batch
 from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
+from rl_coach.exploration_policies.categorical import CategoricalParameters
 from rl_coach.logger import screen
 from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
-from rl_coach.spaces import DiscreteActionSpace
+from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace
 from rl_coach.utils import force_list
@@ -43,7 +44,6 @@ class PPOCriticNetworkParameters(NetworkParameters):
         self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
         self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
         self.heads_parameters = [VHeadParameters()]
-        self.loss_weights = [1.0]
         self.async_training = True
         self.l2_regularization = 0
         self.create_target_network = True
@@ -57,7 +57,6 @@ class PPOActorNetworkParameters(NetworkParameters):
         self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
         self.heads_parameters = [PPOHeadParameters()]
         self.optimizer_type = 'Adam'
-        self.loss_weights = [1.0]
         self.async_training = True
         self.l2_regularization = 0
         self.create_target_network = True
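
Note (not part of the diff): both classes above are plain configuration containers that PPOAgentParameters wires in below under the names 'critic' and 'actor'. A minimal sketch of how a preset could inspect or tweak them after construction, assuming rl_coach's usual network_wrappers attribute on AgentParameters; the adjusted values are illustrative only:

from rl_coach.agents.ppo_agent import PPOAgentParameters

agent_params = PPOAgentParameters()

# The critic/actor configs defined above are reachable through the agent parameters,
# so a preset can adjust fields shown in this diff before the networks are built.
critic_params = agent_params.network_wrappers['critic']
critic_params.l2_regularization = 1e-4       # illustrative value; the diff keeps 0
critic_params.create_target_network = True   # unchanged by the diff

actor_params = agent_params.network_wrappers['actor']
actor_params.optimizer_type = 'Adam'         # same optimizer the diff shows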
@@ -84,7 +83,8 @@ class PPOAlgorithmParameters(AlgorithmParameters):
 class PPOAgentParameters(AgentParameters):
     def __init__(self):
         super().__init__(algorithm=PPOAlgorithmParameters(),
-                         exploration=AdditiveNoiseParameters(),
+                         exploration={DiscreteActionSpace: CategoricalParameters(),
+                                      BoxActionSpace: AdditiveNoiseParameters()},
                          memory=EpisodicExperienceReplayParameters(),
                          networks={"critic": PPOCriticNetworkParameters(), "actor": PPOActorNetworkParameters()})
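
Note (not part of the diff): exploration changes here from a single policy to a mapping from action-space type to exploration parameters, so one agent definition covers both discrete and continuous control. A short sketch of how such a mapping can be resolved against the environment's action-space type; the helper resolve_exploration_params is hypothetical and only illustrates the lookup:

from rl_coach.agents.ppo_agent import PPOAgentParameters
from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace

def resolve_exploration_params(exploration, action_space_type):
    """Return the exploration parameters matching the given action-space type,
    or the single parameters object if no per-space mapping was provided."""
    if isinstance(exploration, dict):
        return exploration[action_space_type]
    return exploration

agent_params = PPOAgentParameters()
# Discrete environments resolve to categorical sampling, continuous (Box) ones to additive noise.
discrete_params = resolve_exploration_params(agent_params.exploration, DiscreteActionSpace)
box_params = resolve_exploration_params(agent_params.exploration, BoxActionSpace)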
@@ -313,6 +313,9 @@ class PPOAgent(ActorCriticAgent):
     def train(self):
         loss = 0
         if self._should_train(wait_for_full_episode=True):
+            for network in self.networks.values():
+                network.set_is_training(True)
+
             for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
                 self.networks['actor'].sync()
                 self.networks['critic'].sync()
@@ -330,6 +333,9 @@ class PPOAgent(ActorCriticAgent):
                 self.value_loss.add_sample(value_loss)
                 self.policy_loss.add_sample(policy_loss)
 
+            for network in self.networks.values():
+                network.set_is_training(False)
+
             self.post_training_commands()
             self.training_iteration += 1
             self.update_log()  # should be done in order to update the data that has been accumulated * while not playing *
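
Note (not part of the diff): the two added loops bracket the optimization phase with set_is_training(True) / set_is_training(False) on every network wrapper. A hedged sketch of the same bracketing expressed as a context manager, which also restores the flag if a training step raises; network_training_mode is illustrative, not an rl_coach API:

from contextlib import contextmanager

@contextmanager
def network_training_mode(networks):
    """Mark every network wrapper as training for the duration of the block,
    clearing the flag again even if an exception interrupts training."""
    for network in networks.values():
        network.set_is_training(True)
    try:
        yield
    finally:
        for network in networks.values():
            network.set_is_training(False)

# Inside a train() method shaped like the one in this diff it would be used as:
# with network_training_mode(self.networks):
#     for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
#         ...sync the networks and run the updates...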