coach v0.8.0

2026-04-29 12:04:10 +02:00 · 2017-10-19 13:10:15 +03:00
parent 7f77813a39
commit 1d4c3455e7
123 changed files with 10996 additions and 203 deletions
@@ -0,0 +1,87 @@
+#
+# Copyright (c) 2017 Intel Corporation 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from agents.policy_optimization_agent import *
+import numpy as np
+from logger import *
+import tensorflow as tf
+import matplotlib.pyplot as plt
+
+from utils import *
+
+
+class PolicyGradientsAgent(PolicyOptimizationAgent):
+    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
+        PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+
+        self.last_gradient_update_step_idx = 0
+
+    def learn_from_batch(self, batch):
+        # batch contains a list of episodes to learn from
+        current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch)
+
+        for i in reversed(range(len(total_returns))):
+            if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
+                total_returns[i] = total_returns[0]
+            elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
+                # just take the total return as it is
+                pass
+            elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
+                # we can get a single transition episode while playing Doom Basic, causing the std to be 0
+                if self.std_discounted_return != 0:
+                    total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
+                else:
+                    total_returns[i] = 0
+            elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
+                total_returns[i] -= self.mean_return_over_multiple_episodes[i]
+            else:
+                screen.warning("WARNING: The requested policy gradient rescaler is not available")
+
+        targets = total_returns
+        if not self.env.discrete_controls and len(actions.shape) < 2:
+            actions = np.expand_dims(actions, -1)
+
+        logger.create_signal_value('Returns Variance', np.std(total_returns), self.task_id)
+        logger.create_signal_value('Returns Mean', np.mean(total_returns), self.task_id)
+
+        result = self.main_network.online_network.accumulate_gradients([current_states, actions], targets)
+        total_loss = result[0]
+
+        return total_loss
+
+    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+        # convert to batch so we can run it through the network
+        observation = np.expand_dims(np.array(curr_state['observation']), 0)
+        if self.env.discrete_controls:
+            # DISCRETE
+            action_values = self.main_network.online_network.predict(observation).squeeze()
+            if phase == RunPhase.TRAIN:
+                action = self.exploration_policy.get_action(action_values)
+            else:
+                action = np.argmax(action_values)
+            action_value = {"action_probability": action_values[action]}
+            self.entropy.add_sample(-np.sum(action_values * np.log(action_values)))
+        else:
+            # CONTINUOUS
+            result = self.main_network.online_network.predict(observation)
+            action_values = result[0].squeeze()
+            if phase == RunPhase.TRAIN:
+                action = self.exploration_policy.get_action(action_values)
+            else:
+                action = action_values
+            action_value = {}
+
+        return action, action_value