    def apply_gradients_to_global_network(self, gradients=None, additional_inputs=None):
        """
        Apply gradients from the online network on the global network
        :param gradients: optional gradients that will be used instead of the accumulated gradients
+        :param additional_inputs: optional additional inputs required for when applying the gradients (e.g. batchnorm's
+                                  update ops also require the inputs)
        :return:
        """
        if gradients is None:
            gradients = self.online_network.accumulated_gradients
        if self.network_parameters.shared_optimizer:
-            self.global_network.apply_gradients(gradients)
+            self.global_network.apply_gradients(gradients, additional_inputs=additional_inputs)
        else:
-            self.online_network.apply_gradients(gradients)
+            self.online_network.apply_gradients(gradients, additional_inputs=additional_inputs)
    def apply_gradients_to_online_network(self, gradients=None, additional_inputs=None):
        """
        Apply gradients from the online network on itself
+        :param gradients: optional gradients that will be used instead of the accumulated gradients
+        :param additional_inputs: optional additional inputs required for when applying the gradients (e.g. batchnorm's
+                                  update ops also require the inputs)
        :return:
        """
        if gradients is None:
            gradients = self.online_network.accumulated_gradients
-        self.online_network.apply_gradients(gradients)
+        self.online_network.apply_gradients(gradients, additional_inputs=additional_inputs)
    def train_and_sync_networks(self, inputs, targets, additional_fetches=[], importance_weights=None,
+                                use_inputs_for_apply_gradients=False):
        """
        A generic training function that enables multi-threading training using a global network if necessary.
@@ -340,14 +346,20 @@
        :param additional_fetches: Any additional tensor the user wants to fetch
        :param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
                                   error of this sample. If it is not given, the samples losses won't be scaled
+        :param use_inputs_for_apply_gradients: Add the inputs also for when applying gradients
+                                               (e.g. for incorporating batchnorm update ops)
        :return: The loss of the training iteration
        """
        result = self.online_network.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
                                                          importance_weights=importance_weights, no_accumulation=True)
-        self.apply_gradients_and_sync_networks(reset_gradients=False)
+        if use_inputs_for_apply_gradients:
+            self.apply_gradients_and_sync_networks(reset_gradients=False, additional_inputs=inputs)
+        else:
+            self.apply_gradients_and_sync_networks(reset_gradients=False)
+
        return result
    def apply_gradients_and_sync_networks(self, reset_gradients=True, additional_inputs=None):
        """
        Applies the gradients accumulated in the online network to the global network or to itself and syncs the
        networks if necessary
@@ -356,17 +368,22 @@
                                the network. this is useful when the accumulated gradients are overwritten instead
                                of accumulated by the accumulate_gradients function. this allows reducing time
                                complexity for this function by around 10%
+        :param additional_inputs: optional additional inputs required for when applying the gradients (e.g. batchnorm's
+                                  update ops also require the inputs)
+
        """
        if self.global_network:
-            self.apply_gradients_to_global_network()
+            self.apply_gradients_to_global_network(additional_inputs=additional_inputs)
            if reset_gradients:
                self.online_network.reset_accumulated_gradients()
            self.update_online_network()
        else:
            if reset_gradients:
-                self.online_network.apply_and_reset_gradients(self.online_network.accumulated_gradients)
+                self.online_network.apply_and_reset_gradients(self.online_network.accumulated_gradients,
+                                                              additional_inputs=additional_inputs)
            else:
-                self.online_network.apply_gradients(self.online_network.accumulated_gradients)
+                self.online_network.apply_gradients(self.online_network.accumulated_gradients,
+                                                    additional_inputs=additional_inputs)
    def parallel_prediction(self, network_input_tuples: List[Tuple]):
        """
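For context, here is a minimal sketch of a call site for the new flag. The names `network`, `batch_inputs` and `batch_targets` are hypothetical; only `train_and_sync_networks` and its `use_inputs_for_apply_gradients` argument come from the change above.

    # Hedged sketch: a network wrapper whose online network contains batchnorm
    # layers, so the batchnorm update ops need the current batch when the
    # accumulated gradients are applied.
    loss = network.train_and_sync_networks(
        inputs=batch_inputs,                   # dict of input embedder name -> np.ndarray (assumed)
        targets=batch_targets,
        use_inputs_for_apply_gradients=True)   # forward `inputs` down to apply_gradients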
diff --git a/docs/_modules/rl_coach/environments/gym_environment.html b/docs/_modules/rl_coach/environments/gym_environment.html
index ef1954d..762cf94 100644
--- a/docs/_modules/rl_coach/environments/gym_environment.html
+++ b/docs/_modules/rl_coach/environments/gym_environment.html
@@ -213,7 +213,7 @@
failed_imports.append("RoboSchool")try:
- fromrl_coach.gym_extensions.continuousimportmujoco
+ fromgym_extensions.continuousimportmujocoexcept:fromrl_coach.loggerimportfailed_importsfailed_imports.append("GymExtensions")
@@ -575,9 +575,6 @@
else:screen.error("Error: Environment {} does not support human control.".format(self.env),crash=True)
- # initialize the state by getting a new state from the environment
- self.reset_internal_state(True)
-
# renderifself.is_rendered:image=self.get_rendered_image()
@@ -588,7 +585,6 @@
            self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)

        # the info is only updated after the first step
-        self.state = self.step(self.action_space.default_action).next_state
        self.state_space['measurements'] = VectorObservationSpace(shape=len(self.info.keys()))

        if self.env.spec and custom_reward_threshold is None:
diff --git a/docs/_modules/rl_coach/filters/reward/reward_normalization_filter.html b/docs/_modules/rl_coach/filters/reward/reward_normalization_filter.html
index 7dc34ba..5bb904f 100644
--- a/docs/_modules/rl_coach/filters/reward/reward_normalization_filter.html
+++ b/docs/_modules/rl_coach/filters/reward/reward_normalization_filter.html
@@ -247,15 +247,14 @@
    def filter(self, reward: RewardType, update_internal_state: bool=True) -> RewardType:
        if update_internal_state:
+            if not isinstance(reward, np.ndarray) or len(reward.shape) < 2:
+                reward = np.array([[reward]])
            self.running_rewards_stats.push(reward)

-        reward = (reward - self.running_rewards_stats.mean) / \
-                 (self.running_rewards_stats.std + 1e-15)
-        reward = np.clip(reward, self.clip_min, self.clip_max)
-
-        return reward
+        return self.running_rewards_stats.normalize(reward).squeeze()

    def get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace:
+        self.running_rewards_stats.set_params(shape=(1,), clip_values=(self.clip_min, self.clip_max))
        return input_reward_space

    def save_state_to_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str):
diff --git a/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html b/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html
index 2070f69..b0687e8 100644
--- a/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html
+++ b/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html
@@ -198,6 +198,8 @@
# limitations under the License.
#
import ast
+
+import pickle
from copy import deepcopy
import math
@@ -324,14 +326,27 @@
    def shuffle_episodes(self):
        """
-        Shuffle all the episodes in the replay buffer
+        Shuffle all the complete episodes in the replay buffer, while deleting the last non-complete episode
        :return:
        """
+ self.reader_writer_lock.lock_writing()
+
self.assert_not_frozen()
+        # unlike the standard usage of the EpisodicExperienceReplay, where we always leave an empty episode after
+        # the last full one, so that new transitions will have somewhere to be added, in this case we deliberately remove
+        # that empty last episode, as we are about to shuffle the memory, and we don't want it to be shuffled in
+ self.remove_last_episode(lock=False)
+
        random.shuffle(self._buffer)
        self.transitions = [t for e in self._buffer for t in e.transitions]
+ # create a new Episode for the next transitions to be placed into
+ self._buffer.append(Episode(n_step=self.n_step))
+ self._length+=1
+
+ self.reader_writer_lock.release_writing()
+
    def get_shuffled_training_data_generator(self, size: int) -> List[Transition]:
        """
        Get a generator for iterating through the shuffled replay buffer, for processing the data in epochs.
@@ -384,10 +399,10 @@
        granularity, size = self.max_size
        if granularity == MemoryGranularity.Transitions:
            while size != 0 and self.num_transitions() > size:
-                self._remove_episode(0)
+                self.remove_first_episode(lock=False)
        elif granularity == MemoryGranularity.Episodes:
            while self.length() > size:
-                self._remove_episode(0)
+                self.remove_first_episode(lock=False)

    def _update_episode(self, episode: Episode) -> None:
        episode.update_transitions_rewards_and_bootstrap_data()
@@ -504,31 +519,53 @@
    def _remove_episode(self, episode_index: int) -> None:
        """
-        Remove the episode in the given index (even if it is not complete yet)
-        :param episode_index: the index of the episode to remove
+        Remove either the first or the last episode
+        :param episode_index: the index of the episode to remove (either 0 or -1)
        :return: None
        """
        self.assert_not_frozen()
+        assert episode_index == 0 or episode_index == -1, "_remove_episode only supports removing the first or the last " \
+                                                          "episode"
-        if len(self._buffer) > episode_index:
+        if len(self._buffer) > 0:
            episode_length = self._buffer[episode_index].length()
            self._length -= 1
            self._num_transitions -= episode_length
            self._num_transitions_in_complete_episodes -= episode_length
-            del self.transitions[:episode_length]
+            if episode_index == 0:
+                del self.transitions[:episode_length]
+            else:  # episode_index = -1
+                del self.transitions[-episode_length:]
            del self._buffer[episode_index]
- defremove_episode(self,episode_index:int)->None:
+    def remove_first_episode(self, lock: bool = True) -> None:
        """
- Remove the episode in the given index (even if it is not complete yet)
- :param episode_index: the index of the episode to remove
+ Remove the first episode (even if it is not complete yet)
+ :param lock: if true, will lock the readers writers lock. this can cause a deadlock if an inheriting class
+ locks and then calls store with lock = True :return: None """
- self.reader_writer_lock.lock_writing_and_reading()
+        if lock:
+            self.reader_writer_lock.lock_writing_and_reading()
-        self._remove_episode(episode_index)
+        self._remove_episode(0)
+        if lock:
+ self.reader_writer_lock.release_writing_and_reading()
- self.reader_writer_lock.release_writing_and_reading()
+    def remove_last_episode(self, lock: bool = True) -> None:
+ """
+ Remove the last episode (even if it is not complete yet)
+ :param lock: if true, will lock the readers writers lock. this can cause a deadlock if an inheriting class
+ locks and then calls store with lock = True
+ :return: None
+ """
+        if lock:
+            self.reader_writer_lock.lock_writing_and_reading()
+
+        self._remove_episode(-1)
+
+        if lock:
+            self.reader_writer_lock.release_writing_and_reading()

    # for API compatibility
    def get(self, episode_index: int, lock: bool = True) -> Union[None, Episode]:
@@ -555,15 +592,6 @@
returnepisode
- # for API compatibility
- defremove(self,episode_index:int):
- """
- Remove the episode in the given index (even if it is not complete yet)
- :param episode_index: the index of the episode to remove
- :return: None
- """
- self.remove_episode(episode_index)
-
    def clean(self) -> None:
        """
        Clean the memory by removing all the episodes
@@ -629,7 +657,7 @@
            transitions.append(
                Transition(state={'observation': state},
-                           action=current_transition['action'], reward=current_transition['reward'],
+                           action=int(current_transition['action']), reward=current_transition['reward'],
                            next_state={'observation': next_state}, game_over=False,
                            info={'all_action_probabilities':
                                      ast.literal_eval(current_transition['all_action_probabilities'])}),
@@ -698,7 +726,40 @@
            episode_num, episode = self.get_episode_for_transition(transition)
            self.last_training_set_episode_id = episode_num
            self.last_training_set_transition_id = \
-            len([t for e in self.get_all_complete_episodes_from_to(0, self.last_training_set_episode_id + 1) for t in e])
+            len([t for e in self.get_all_complete_episodes_from_to(0, self.last_training_set_episode_id + 1) for t in e])
+
+    def save(self, file_path: str) -> None:
+        """
+        Save the replay buffer contents to a pickle file
+        :param file_path: the path to the file that will be used to store the pickled transitions
+        """
+        with open(file_path, 'wb') as file:
+            pickle.dump(self.get_all_complete_episodes(), file)
+
+    def load_pickled(self, file_path: str) -> None:
+        """
+        Restore the replay buffer contents from a pickle file.
+        The pickle file is assumed to include a list of episodes.
+        :param file_path: The path to a pickle file to restore
+        """
+        self.assert_not_frozen()
+
+        with open(file_path, 'rb') as file:
+            episodes = pickle.load(file)
+            num_transitions = sum([len(e.transitions) for e in episodes])
+            if num_transitions > self.max_size[1]:
+                screen.warning("Warning! The number of transitions to load into the replay buffer ({}) is "
+                               "bigger than the max size of the replay buffer ({}). The excessive transitions will "
+                               "not be stored.".format(num_transitions, self.max_size[1]))
+
+            progress_bar = ProgressBar(len(episodes))
+            for episode_idx, episode in enumerate(episodes):
+                self.store_episode(episode)
+
+ # print progress
+ progress_bar.update(episode_idx)
+
+ progress_bar.close()
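As a usage illustration of the new persistence methods (the file path and the default construction are assumptions; only `save` and `load_pickled` come from the diff):

    from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay

    # Assumed default construction; inside a preset the memory is normally built
    # from EpisodicExperienceReplayParameters instead.
    memory = EpisodicExperienceReplay()
    # ... store episodes into `memory` while running a deployed policy ...
    memory.save('/tmp/episodes.p')            # pickles all complete episodes

    restored = EpisodicExperienceReplay()
    restored.load_pickled('/tmp/episodes.p')  # re-stores each pickled episode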
diff --git a/docs/_modules/rl_coach/memories/non_episodic/experience_replay.html b/docs/_modules/rl_coach/memories/non_episodic/experience_replay.html
index 7ca76dd..3956160 100644
--- a/docs/_modules/rl_coach/memories/non_episodic/experience_replay.html
+++ b/docs/_modules/rl_coach/memories/non_episodic/experience_replay.html
@@ -381,15 +381,6 @@
"""returnself.get_transition(transition_index,lock)
- # for API compatibility
- defremove(self,transition_index:int,lock:bool=True):
- """
- Remove the transition in the given index
- :param transition_index: the index of the transition to remove
- :return: None
- """
- self.remove_transition(transition_index,lock)
-
defclean(self,lock:bool=True)->None:""" Clean the memory by removing all the episodes
diff --git a/docs/_sources/features/batch_rl.rst.txt b/docs/_sources/features/batch_rl.rst.txt
new file mode 100644
index 0000000..e7dea34
--- /dev/null
+++ b/docs/_sources/features/batch_rl.rst.txt
@@ -0,0 +1,18 @@
+Batch Reinforcement Learning
+============================
+
+Coach supports Batch Reinforcement Learning, where learning is based solely on a (fixed) batch of data.
+In Batch RL, we are given a dataset of experience, which was collected using some (one or more) deployed policies, and we would
+like to use it to learn a better policy than what was used to collect the dataset.
+There is no simulator to interact with, and so we cannot collect any new data, meaning we often cannot explore the MDP any further.
+To make things even harder, we would also like to use the dataset in order to evaluate the newly learned policy
+(using off-policy evaluation), since we do not have a simulator which we can use to evaluate the policy on.
+Batch RL is also often beneficial in cases where we just want to separate the inference (data collection) from the
+training process of a new policy. This is often the case where we have a system on which we could quite easily deploy a policy
+and collect experience data, but cannot easily use that system's setup to train a new policy online (as is often the
+case with more standard RL algorithms).
+
+Coach supports (almost) all of the integrated off-policy algorithms with Batch RL.
+
+A lot more details and example usage can be found in the
+`tutorial `_.
\ No newline at end of file
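A rough sketch of the workflow described above, using only the pieces visible elsewhere in this change (`EpisodicExperienceReplay.save` and the memory parameters' `load_memory_from_file_path` attribute); the path and the surrounding preset code are placeholders:

    # 1. While running the deployed policy, persist the collected experience.
    memory.save('/path/to/collected_experience.p')

    # 2. In the Batch RL preset, point the agent's replay buffer at that file so
    #    training (and off-policy evaluation) uses only the fixed dataset.
    agent_params.memory.load_memory_from_file_path = '/path/to/collected_experience.p'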
diff --git a/docs/_sources/features/index.rst.txt b/docs/_sources/features/index.rst.txt
index 3661755..71931fb 100644
--- a/docs/_sources/features/index.rst.txt
+++ b/docs/_sources/features/index.rst.txt
@@ -7,4 +7,5 @@ Features
algorithms
environments
- benchmarks
\ No newline at end of file
+ benchmarks
+ batch_rl
\ No newline at end of file
diff --git a/docs/components/architectures/index.html b/docs/components/architectures/index.html
index 6a4c5b2..7248e08 100644
--- a/docs/components/architectures/index.html
+++ b/docs/components/architectures/index.html
@@ -544,26 +544,34 @@ multi-process distributed mode. The network wrapper contains functionality for m
between them.
Applies the gradients accumulated in the online network to the global network or to itself and syncs the
networks if necessary
Parameters
-
reset_gradients – If set to True, the accumulated gradients won't be reset to 0 after applying them to
+
+
reset_gradients – If set to True, the accumulated gradients won't be reset to 0 after applying them to
the network. this is useful when the accumulated gradients are overwritten instead
of accumulated by the accumulate_gradients function. this allows reducing time
-complexity for this function by around 10%
+complexity for this function by around 10%
+
additional_inputs – optional additional inputs required for when applying the gradients (e.g. batchnorm's
+update ops also require the inputs)
Apply gradients from the online network on itself
+:param gradients: optional gradients that will be used instead of the accumulated gradients
+:param additional_inputs: optional additional inputs required for when applying the gradients (e.g. batchnorm's
+
+
update ops also require the inputs)
+
Returns
@@ -650,7 +663,7 @@ target_network or global_network) and the second element is the inputs
A generic training function that enables multi-threading training using a global network if necessary.
Parameters
@@ -660,6 +673,8 @@ target_network or global_network) and the second element is the inputs
additional_fetches – Any additional tensor the user wants to fetch
importance_weights – A coefficient for each sample in the batch, which will be used to rescale the loss
error of this sample. If it is not given, the samples losses won’t be scaled
+
use_inputs_for_apply_gradients – Add the inputs also for when applying gradients
+(e.g. for incorporating batchnorm update ops)
Coach supports Batch Reinforcement Learning, where learning is based solely on a (fixed) batch of data.
+In Batch RL, we are given a dataset of experience, which was collected using some (one or more) deployed policies, and we would
+like to use it to learn a better policy than what was used to collect the dataset.
+There is no simulator to interact with, and so we cannot collect any new data, meaning we often cannot explore the MDP any further.
+To make things even harder, we would also like to use the dataset in order to evaluate the newly learned policy
+(using off-policy evaluation), since we do not have a simulator which we can use to evaluate the policy on.
+Batch RL is also often beneficial in cases where we just want to separate the inference (data collection) from the
+training process of a new policy. This is often the case where we have a system on which we could quite easily deploy a policy
and collect experience data, but cannot easily use that system's setup to train a new policy online (as is often the
+case with more standard RL algorithms).
+
Coach supports (almost) all of the integrated off-policy algorithms with Batch RL.
+
A lot more details and example usage can be found in the
+tutorial.
+
\ No newline at end of file
diff --git a/docs/features/index.html b/docs/features/index.html
index ebb9bee..72b6934 100644
--- a/docs/features/index.html
+++ b/docs/features/index.html
@@ -95,6 +95,7 @@
diff --git a/docs/index.html b/docs/index.html
index 7124f5c..b08f865 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -210,6 +210,7 @@ Coach collects statistics from the training process and supports advanced visual
diff --git a/docs/objects.inv b/docs/objects.inv
index c23977828b6ab3e40529690326049809281e80ec..b2a5dbc17ad41c3036f18db4f4468a41fdf0cace 100644
GIT binary patch
delta 3163
zcmV-h45ahi9PS*D!+!xF(C9`t+B^v(hN9gZZ86Qo97ULi>hJmG_08jDvSpm)yL5>-
zN^y?42(tZOQG$dB{uM+l%@~z0e$wxM%2-08eaNz$q~tfs38SAR=6si@$sd0P7c+jq
zm@^(mz(S&uSr%QZ5>FMd&}&Bd(+*5%jNeYekxi0VP~9Di&wmArP?CfxP8r{O1l=(v
za0nHJ{_zOQk_1P2DA0@fP@Zr(MmY{C&R-dS_6WS;82`rJyLXH8FylBTs(*!FV3Etd
z!08&tF`-+3;5$U)1duZ#SPZ}6&DbA0p(F>)7s44$p&*QLCMn>rl*L7YgPmRAJ3HAxU4RaO>m|@tXAYf~O`HRPT
zW{4XJ?Fny(gwN_Rk%u`)F!1PBj<6KwlE?7{!bJ9f9$euD7ED4n7`i0#*a(idM8FCg
z3bLgr@d+$IZs8Y{6ly`%BFr~Fp-^|I_Xy~qj^Hrb;eY5kW5nk$!LvrOGrKi*gYOEevOCR*dK`21TNgNXnstoZd#3a61kGz5wSqO4`
zi@8VCJ&pXKDaq&$uE=7k0zsR-V}kt4U<_1B&=?%h8&nV_2-kQcMGI1zoLq=O&L?Oe
zu8Ym)+<(uxF3qW{YiBD8Y^Pk?PHT(p6hUhwFqkM|{EHg8>$-IDo=jD
zp=@UJATXP$>A
zG&@6e8OqKe9mcUc#(_cX4sqNFcE>!?@O1{Z8M_0(jvBZFF)c>z5Ja0H>yCGl@j3|B
z$$wyVX0REl?qH`Grh{;g7^8zQyA04FK;1^CJJNeY(;4B=xO9f-F(`-P95W(^;tn$$
zUD!MrYgPj-O-rNnp1k4so`HC~Mm%9Ym;t<>nd-0d%g{fp;Od7DRt;<+Mw^oJ^41F>
z+F00d*})Gt*IwALrBgFh+S|*^Z$8)n(0{_F*O@IxF_8r&wJN?drC#h|n>AdU{l4sd
ztGWM+H?~%O6o}@azCPkD*J9k$q}43_wU8oq&0ZT-dDJDI_zix41q9pVVMnOI3d((l
z3kX}hKpuZK)K81gUg!X^i;hP-lEb23z?W`a%`Y1leRR#Y`!@!=A@mcIo)|>onUji~7nZmcuXJ*6<^^$r)(H-kIGB$26_f35
zs&6Tc0r;9pxve!3>y%`AV%8!eo_fBB4;oBj>cON8olq+O))12coTaJ#%0c2`K7Y{H
z|L9APYmA-fy}VU|UmA$VBfrM43xC*H+k~w_<25gSY+V>l2*n>zm+_XTBgF4zsm+V-
zpIvRoh-!OtN@klY-anm|G9Nkf%x=erjqq}}1#zh_c;R0ofg3x6Iv(I;zx0Wq3j#VE
zSr@PdKOyma96pZKW)g_B2Fh_B(_P2$2X)O-DHMOBgv81|8m$?l_R-%&6o1lB=C=uv
z-1E!#8I0jeB>=zD4eP;dJZlBIDwT%wjrMT(K~d)UhJSGFMK+#}v)kIhh8m6qJvJ)XLb+MoeGw4&{513<4!f`3B=kNT{
z=&>>d&*>-7{IQD3rhmE)q@HLJNDI~<%XZ&huJ87vtk%#&JmP4R|GUMug5qmc?2Gg*
zOBmNtrYf3%NS~3~KYGC-WklflrMY~NSAAq1IBukCmgK)vMCtsh{A)xmmWy2N&&EJk
zUyR1sICIoI(KH?(sI7z+tN=SoDzMpJX|lTLMwg66F5@~dj(_*_n>+P;Jfio7Kv@^*
zNUjehpGU9+Z*gc}pQfowt+Re`ML7Od5Ln_$4O%(b#fMi4mcQ}pU?#-K^MMs%##w}g
zn1BDd)PIdeR=F0;GEuoD#$6|JUeHJdhET>+5T{RN!GpRUjkt)`^PksOKJfJ!1YvCB
z`DLC1h77+X@_!IeZu~qwOy=cJtjMz>4@<10ELZGpZ2Ws9o{cx#Bk*mJB&OzA`>9q^
zbG%2HXQFR}V#J{xPfRt&H$Kb|dE9N>w&){Z>I2We*MF-+#qe!z)niA<2tAJrw2LWt
z?%4~zd?R8CE-hTl%$b7l@$Tx0{cjQ`92aURUhP-KT7M2w@c@#18fpPrFkX@0_-cQv
z0z0qxv<9)M_z|H?LbXX2f|R
zP^XWT84F(;(|9fe*aQ&4&t3>1}3|}6o54^2p8jv3OYcMXWuy*E*uLU%@`gz<9L7a!U%9Lot6Y)|gY!7e&~-G5>E>
z-ow*^=_OOUuw?+)s@#(8fHM!Cf}oPxl;*umZGT|f>2<;!yRB
z4@7~|EW!Se!dH^-LR);;QOwe^Jpc28MRY#MwS2Uo@%cy{M$|y0I~mI72SCNlN-2Dg
zb55f3@qEssChVEhr~wt57HdDa&3)tms{OJmi;c|Fpj@UZgxux`bHSH@s=~ey{Roo=
z;D33t88ha-NOH2ma+y4T{(7p@dziB)$pN6BF>(NV-bh)3)`<8$
z%7mwWJp0|VJKkq+`{?4ovE0`f*uBTX`U8>2f5j{cE>1iG5L{Nbg8C%E%$~HdkEJKQ
zV{nCc=mm6n;J8khJmEAuk|6O^hBB98a2m~M)sC;_?ckeC_SHkD#(|xt(vbsoq^7G|
zlgSD~0WOma3oC!f=iZgy`wPKbot2XD1?uAmP9J|rCzM?=47$gs`oV``kUc2iysfOAvt-S
zYtWU`H)sahoLrNb6*9VZNG0l5jX+S>4%9<9D>69*eguE!G^IF^vA~H+2sebee%I7M
z7o6VSG>x2D%dA|1%J>zilwX0$`89LMujNeJNXr#GT*z!+(AO4aqdU=-JFQ916_joY
zX!2$JfpmS6$U{YeI#tv{J&bMbf$a%`4I1IIZQ$N8o<0J5
z+)vPmG}B8mX}7bW3=si;wzd&0?{6^Ds}d^HAGU^l$89srs_Pm;=k?}ECqQ}A*eNPe
z#Uqw`;yl9YAVsBB)bC?NTH&)^tKvx66I6^16m-EF$;q!
zo53WmU#LE8Sz5|^_;YrE+jye8f6SLlK@&Wr$o3cI4I@F-zX~uuWCBZ7r2U32GvZ9&
zU&3gCXO$xzYz#!h>hm{i+Wka{WbOrs)`Mjx^A|TEs(i-5#q3}dzaPM}{{xOEw*|BA
BDpUXf
delta 3145
zcmV-P47T&`9NZj`!+(KzG`i7^Hc!Hcp=dWpTTF8?M-k?s`g?wPee-ykY#Ar{E?pvy
zQk-Khf^7d+lprC3e+3asGe+f$pY;2mGM12NAF?bbDfx|Z!ssW7Ip5`J^2eXS#f%>?
z=8T6Cu#l)^mPOa9#8U+<^qNurv;z|wXhwCIzj&-?
zhPaW?p73@^_^ciid6;tq1CMUy2uoouc^qFLOk@k_!4+;`!6bBpp-Cc-AHngK2v}i5
zLADenK7j?uE&PI#LM_Nzg!#rN6zUH39swQH5gbN49DhA$jQAXeyr3c5Y(k^8)J^1<
z0)|hycuM&p`*@@=Bv<}XT1t(e#_Ex^^g(|dgaU+|#4+KZ$`GGIOyZmM$SY`(g&@bb
zn0rLs)5ss1l8pY~iY%5Y5PY+DOpt#WjDboC8iNB`g9@Sq;TmtGXhBMolM6A(`2_95
zb+Or;+kc(w(ww@wcDACxcFMKww6@qz5qymV1`{QWe^EnsU6(F?JUCNJ5s2IN+dCdh
zg-B9X;_J6FJ{oW=Bninf9EVxPN-|xtTz>aru10{|PantG*6=PC%R4WO64S!C$x6c8
zr86JPcbIF{hou`b}tO
zD4W?l2+U?GI|CjykDU=onsEmD634HI$I=YO$(e|5M)n|lSHs$w#A-~tqa8J%2SD~2
z&CXC=hO#qAhjHwVabOU;LmW4P-7!xze4T-9#_j;HqXzCkOp8%F1kq;5y5pT>ybgkO
zGJjZ|8Ei(XJJ@N4=^)%A#^@l-E(3H3P`AbFDs(~%UXj5`t-g+TK
z8w(pQ8~EYo+6x=DG-`%QdwY5L%?BF*T7TH|IgOw)QdfAvxaN4-
z>Pu-1KyN1Hw$?iHr*XfTPX2a_^%LaF>)Lrey6mZtVA2Z@LI{6Sy;
zqc1(KF?OQ&@>U6cX&@et{2IS5V1Hw66SfA8*Sz?#bzw9i6n{Wn##^3_5WknDHZQt=
zcC{TNs_o4wnQg9k|8!c)eB{hCyB!}k!pq$j#HGIAg@26%ZtM)|cz~1r(kFs027v
zJ->XP!5F?&0`M!{upZ3DvsR$1QkggxV~*_6&Te2^XoZm3mKaAoh7wOu#5B4eq!5WP
zftl+E(QI?ZL`1BhQXALKWlx_@@tg6gdjj-YJFzP^oBrr%M=kTK(=i2cS|YR8DzNN;e{`ujdc9s|8%GvWe
zHwjjsec(4d-!dMQC#dNRpG)5!x~U6#n3^ajQZbrymV}}pGQz7#sekXRcdnJSRwkut39*6Ad;xP$!EM5({M^<
zcN!)Te=#(t{UE7lN*Qj*L4=bO4CDzonGC<0W;VDS!Su(f^dh(^ootm>I4-62{GC4<
zJyxdRIsF8hKUOi>RDaij)DuktX~Ftq+3wrR_1%7y)f!reM;vYPf4A6HP<*Y5eUZLp
z3FA7-R7DdI=`&LMM=v;}j0ilxG?x$Zs*kJ#$BlH&lKgjyD4k!Ge~rk+a*?b3*%;{R
zi_sVxXO5aDn#SV;wUy9<6<|k61vc9&O;#7(=#tUMWn2fw@qd1PbEkffNA$iBDC;5}
z$@QV+^9YvUEe`GL(==77b=D8A2*@;6=`%!C+uKCmLpIE%0l
z^Y1^G`mfQ*D%XNpCMvhYxa&mD3mU1w5XyK8;`FI3cu?1)5f{;V{`30E2fjXoAdGE1
zzsz&Mkl~j^9)AMLji0B7$-MlD6?s>%Y~ZV(6P&_1FbEY7Cyt{g0|C@vf$Aub-SNm15mVbj(Jb)yhhFX9Yj9270zS`fa
zfb0orpO_?`f^hY#kjI)Qp~_M8JkdmdVn4sq{6Ai)v?pc$sD6ziwOS10&FdD88F5|+
z)aheo#zIeH8qZ|_n*bvCxh&Q&hp+UF9v*Xg*}UDarBpn6|Byt+|3xBDpE2B_5q?zX
z#B4-1J%6a5;mhN
zi@xMU9)JO%Jo0D!SUfEJBG#VUYaP+ruV5c#U_9Dkxg`b5p_o4xYs@L=iy~~^nE$sb
z@8M~|^pdGv*fIcYRc^_4z?lb6K~PC;O7mW(Hh-|~^g7`(nj_L56y}0Ogxq-`aj06x
z2cke}mSBHK;Va2^p)Ee_C}wF{p5MJ-5uFcmEgvmtd_Gc#5j7C$PKNUN0Z=isQVQSW
zoRjE$JfHKZ347)=YCy&2i?tuz=00)&)qYu(#YX07P%cvyLT+<}x!_AcRbgL4DK$I_GD
zF}T7z^a8p(a9k%$o^YBSNsxFdLzzo4xEf}(`i`&V?ckeC_SHkD#(|xt(vbsoq^7G`
zlgSD~0Vb0S3oCy}ckjyn{z5QUXQgC(f$IFgsq=?4LfHhvzIV_Ns78Q?o)<$Jyv*wn
z$fO}dW?3G8c4sZqdjwCfgu-Y`&>lwrprO~1IAtwlP8NHudS5f5zZp)m`0+3sl9SiD
z232n2QQKs|)BB9lYlM__+WQ;HKA3!JEga6_2ucTEj+
z!Rgr{>{uW-fDg1xrv*Wo}KH
zH$L61>2+&9D(5OR(pUv$uB@O=mUa3-H~FZXE6~hi6_^xRGy9R~K8~Kl(?K&ONNXI`
zCyja_-yKDQ5doU*lf0hUPW|+tb)OCj%uz!1e^e295C9HgInkPalCj
z?k8wOn&~B(w7;{U3=si;Z*3!3-rr!PS0z-YKWq*Aj@xFKRo6Ad8q;W(BAH;O3!=VV
zZc~-lHAHj8=mcq<(>PpAs^<>mLHmx8W8wl6wUDG>;Sj%#3Pnl{5-`_.
\ No newline at end of file
diff --git a/docs_raw/source/features/index.rst b/docs_raw/source/features/index.rst
index 3661755..71931fb 100644
--- a/docs_raw/source/features/index.rst
+++ b/docs_raw/source/features/index.rst
@@ -7,4 +7,5 @@ Features
algorithms
environments
- benchmarks
\ No newline at end of file
+ benchmarks
+ batch_rl
\ No newline at end of file
diff --git a/rl_coach/architectures/head_parameters.py b/rl_coach/architectures/head_parameters.py
index 1c64b63..ee607dd 100644
--- a/rl_coach/architectures/head_parameters.py
+++ b/rl_coach/architectures/head_parameters.py
@@ -50,43 +50,51 @@ class PPOHeadParameters(HeadParameters):
class VHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='v_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
- loss_weight: float = 1.0, dense_layer=None, initializer='normalized_columns'):
+ loss_weight: float = 1.0, dense_layer=None, initializer='normalized_columns',
+ output_bias_initializer=None):
super().__init__(parameterized_class_name="VHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
self.initializer = initializer
+ self.output_bias_initializer = output_bias_initializer
class DDPGVHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='ddpg_v_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
- loss_weight: float = 1.0, dense_layer=None, initializer='normalized_columns'):
+ loss_weight: float = 1.0, dense_layer=None, initializer='normalized_columns',
+ output_bias_initializer=None):
super().__init__(parameterized_class_name="DDPGVHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
self.initializer = initializer
+ self.output_bias_initializer = output_bias_initializer
class CategoricalQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='categorical_q_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
- loss_weight: float = 1.0, dense_layer=None):
+ loss_weight: float = 1.0, dense_layer=None,
+ output_bias_initializer=None):
super().__init__(parameterized_class_name="CategoricalQHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
+ self.output_bias_initializer = output_bias_initializer
class RegressionHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='q_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
- loss_weight: float = 1.0, dense_layer=None, scheme=None):
+ loss_weight: float = 1.0, dense_layer=None, scheme=None,
+ output_bias_initializer=None):
super().__init__(parameterized_class_name="RegressionHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
+ self.output_bias_initializer = output_bias_initializer
class DDPGActorHeadParameters(HeadParameters):
@@ -153,21 +161,23 @@ class PolicyHeadParameters(HeadParameters):
class PPOVHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='ppo_v_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
- loss_weight: float = 1.0, dense_layer=None):
+ loss_weight: float = 1.0, dense_layer=None, output_bias_initializer=None):
super().__init__(parameterized_class_name="PPOVHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
+ self.output_bias_initializer = output_bias_initializer
class QHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='q_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
- loss_weight: float = 1.0, dense_layer=None):
+ loss_weight: float = 1.0, dense_layer=None, output_bias_initializer=None):
super().__init__(parameterized_class_name="QHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
+ self.output_bias_initializer = output_bias_initializer
class ClassificationHeadParameters(HeadParameters):
@@ -183,11 +193,12 @@ class ClassificationHeadParameters(HeadParameters):
class QuantileRegressionQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='quantile_regression_q_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
- loss_weight: float = 1.0, dense_layer=None):
+ loss_weight: float = 1.0, dense_layer=None, output_bias_initializer=None):
super().__init__(parameterized_class_name="QuantileRegressionQHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
+ self.output_bias_initializer = output_bias_initializer
class RainbowQHeadParameters(HeadParameters):
@@ -218,18 +229,21 @@ class SACPolicyHeadParameters(HeadParameters):
class SACQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='sac_q_head_params', dense_layer=None,
- layers_sizes: tuple = (256, 256)):
+ layers_sizes: tuple = (256, 256), output_bias_initializer=None):
super().__init__(parameterized_class_name='SACQHead', activation_function=activation_function, name=name,
dense_layer=dense_layer)
self.network_layers_sizes = layers_sizes
+ self.output_bias_initializer = output_bias_initializer
class TD3VHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='td3_v_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
- loss_weight: float = 1.0, dense_layer=None, initializer='xavier'):
+ loss_weight: float = 1.0, dense_layer=None, initializer='xavier',
+ output_bias_initializer=None):
super().__init__(parameterized_class_name="TD3VHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
- self.initializer = initializer
\ No newline at end of file
+ self.initializer = initializer
+ self.output_bias_initializer = output_bias_initializer
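To illustrate how the new `output_bias_initializer` argument is meant to be used (the constant value is made up; `tf.constant_initializer` is a standard TensorFlow 1.x initializer and `QHeadParameters` is the class changed above):

    import tensorflow as tf
    from rl_coach.architectures.head_parameters import QHeadParameters

    # Hypothetical preset snippet: start the Q head's output bias at an assumed
    # average return of -1.0 instead of the default zeros initializer.
    q_head_params = QHeadParameters(output_bias_initializer=tf.constant_initializer(-1.0))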
diff --git a/rl_coach/architectures/tensorflow_components/heads/categorical_q_head.py b/rl_coach/architectures/tensorflow_components/heads/categorical_q_head.py
index 56427ed..b573fe5 100644
--- a/rl_coach/architectures/tensorflow_components/heads/categorical_q_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/categorical_q_head.py
@@ -26,9 +26,9 @@ from rl_coach.spaces import SpacesDefinition
class CategoricalQHead(QHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str ='relu',
- dense_layer=Dense):
+ dense_layer=Dense, output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
- dense_layer=dense_layer)
+ dense_layer=dense_layer, output_bias_initializer=output_bias_initializer)
self.name = 'categorical_dqn_head'
self.num_actions = len(self.spaces.action.actions)
self.num_atoms = agent_parameters.algorithm.atoms
@@ -37,7 +37,8 @@ class CategoricalQHead(QHead):
self.loss_type = []
def _build_module(self, input_layer):
- values_distribution = self.dense_layer(self.num_actions * self.num_atoms)(input_layer, name='output')
+ values_distribution = self.dense_layer(self.num_actions * self.num_atoms)\
+ (input_layer, name='output', bias_initializer=self.output_bias_initializer)
values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions,
self.num_atoms))
# softmax on atoms dimension
diff --git a/rl_coach/architectures/tensorflow_components/heads/cil_head.py b/rl_coach/architectures/tensorflow_components/heads/cil_head.py
index 15f9de1..f3ae003 100644
--- a/rl_coach/architectures/tensorflow_components/heads/cil_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/cil_head.py
@@ -27,7 +27,7 @@ from rl_coach.utils import force_list
class RegressionHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
- dense_layer=Dense, scheme=[Dense(256), Dense(256)]):
+ dense_layer=Dense, scheme=[Dense(256), Dense(256)], output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'regression_head'
@@ -42,6 +42,7 @@ class RegressionHead(Head):
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
+ self.output_bias_initializer = output_bias_initializer
def _build_module(self, input_layer):
self.layers.append(input_layer)
@@ -50,7 +51,8 @@ class RegressionHead(Head):
layer_params(input_layer=self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
))
- self.layers.append(self.dense_layer(self.num_actions)(self.layers[-1], name='output'))
+ self.layers.append(self.dense_layer(self.num_actions)(self.layers[-1], name='output',
+ bias_initializer=self.output_bias_initializer))
self.output = self.layers[-1]
def __str__(self):
diff --git a/rl_coach/architectures/tensorflow_components/heads/ddpg_v_head.py b/rl_coach/architectures/tensorflow_components/heads/ddpg_v_head.py
index 4c30829..b22fa69 100644
--- a/rl_coach/architectures/tensorflow_components/heads/ddpg_v_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/ddpg_v_head.py
@@ -24,9 +24,10 @@ from rl_coach.spaces import SpacesDefinition
class DDPGVHead(VHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
- dense_layer=Dense, initializer='normalized_columns'):
+ dense_layer=Dense, initializer='normalized_columns', output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
- dense_layer=dense_layer, initializer=initializer)
+ dense_layer=dense_layer, initializer=initializer,
+ output_bias_initializer=output_bias_initializer)
def _build_module(self, input_layer):
super()._build_module(input_layer)
diff --git a/rl_coach/architectures/tensorflow_components/heads/ppo_v_head.py b/rl_coach/architectures/tensorflow_components/heads/ppo_v_head.py
index 968a97a..e2abbfc 100644
--- a/rl_coach/architectures/tensorflow_components/heads/ppo_v_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/ppo_v_head.py
@@ -26,18 +26,20 @@ from rl_coach.spaces import SpacesDefinition
class PPOVHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
- dense_layer=Dense):
+ dense_layer=Dense, output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'ppo_v_head'
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
self.return_type = ActionProbabilities
+ self.output_bias_initializer = output_bias_initializer
def _build_module(self, input_layer):
self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values")
self.input = [self.old_policy_value]
self.output = self.dense_layer(1)(input_layer, name='output',
- kernel_initializer=normalized_columns_initializer(1.0))
+ kernel_initializer=normalized_columns_initializer(1.0),
+ bias_initializer=self.output_bias_initializer)
self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return")
value_loss_1 = tf.square(self.output - self.target)
diff --git a/rl_coach/architectures/tensorflow_components/heads/q_head.py b/rl_coach/architectures/tensorflow_components/heads/q_head.py
index ecc1461..0bd120b 100644
--- a/rl_coach/architectures/tensorflow_components/heads/q_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/q_head.py
@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition, BoxActionSpace, DiscreteActionSpac
class QHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
- dense_layer=Dense):
+ dense_layer=Dense, output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'q_values_head'
@@ -46,9 +46,12 @@ class QHead(Head):
else:
self.loss_type = tf.losses.mean_squared_error
+ self.output_bias_initializer = output_bias_initializer
+
def _build_module(self, input_layer):
# Standard Q Network
- self.q_values = self.output = self.dense_layer(self.num_actions)(input_layer, name='output')
+ self.q_values = self.output = self.dense_layer(self.num_actions)\
+ (input_layer, name='output', bias_initializer=self.output_bias_initializer)
# used in batch-rl to estimate a probability distribution over actions
self.softmax = self.add_softmax_with_temperature()
diff --git a/rl_coach/architectures/tensorflow_components/heads/quantile_regression_q_head.py b/rl_coach/architectures/tensorflow_components/heads/quantile_regression_q_head.py
index 5edcbfb..4e32e91 100644
--- a/rl_coach/architectures/tensorflow_components/heads/quantile_regression_q_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/quantile_regression_q_head.py
@@ -25,9 +25,9 @@ from rl_coach.spaces import SpacesDefinition
class QuantileRegressionQHead(QHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
- dense_layer=Dense):
+ dense_layer=Dense, output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
- dense_layer=dense_layer)
+ dense_layer=dense_layer, output_bias_initializer=output_bias_initializer)
self.name = 'quantile_regression_dqn_head'
self.num_actions = len(self.spaces.action.actions)
self.num_atoms = agent_parameters.algorithm.atoms # we use atom / quantile interchangeably
@@ -43,7 +43,8 @@ class QuantileRegressionQHead(QHead):
self.input = [self.actions, self.quantile_midpoints]
# the output of the head is the N unordered quantile locations {theta_1, ..., theta_N}
- quantiles_locations = self.dense_layer(self.num_actions * self.num_atoms)(input_layer, name='output')
+ quantiles_locations = self.dense_layer(self.num_actions * self.num_atoms)\
+ (input_layer, name='output', bias_initializer=self.output_bias_initializer)
quantiles_locations = tf.reshape(quantiles_locations, (tf.shape(quantiles_locations)[0], self.num_actions, self.num_atoms))
self.output = quantiles_locations
diff --git a/rl_coach/architectures/tensorflow_components/heads/sac_q_head.py b/rl_coach/architectures/tensorflow_components/heads/sac_q_head.py
index cc2d95d..dbac165 100644
--- a/rl_coach/architectures/tensorflow_components/heads/sac_q_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/sac_q_head.py
@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition, BoxActionSpace
class SACQHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
- dense_layer=Dense):
+ dense_layer=Dense, output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'q_values_head'
@@ -41,6 +41,7 @@ class SACQHead(Head):
self.return_type = QActionStateValue
# extract the topology from the SACQHeadParameters
self.network_layers_sizes = agent_parameters.network_wrappers['q'].heads_parameters[0].network_layers_sizes
+ self.output_bias_initializer = output_bias_initializer
def _build_module(self, input_layer):
# SAC Q network is basically 2 networks running in parallel on the same input (state , action)
@@ -63,7 +64,8 @@ class SACQHead(Head):
for layer_size in self.network_layers_sizes[1:]:
qi_output = self.dense_layer(layer_size)(qi_output, activation=self.activation_function)
# the output layer
- self.q1_output = self.dense_layer(1)(qi_output, name='q1_output')
+ self.q1_output = self.dense_layer(1)(qi_output, name='q1_output',
+ bias_initializer=self.output_bias_initializer)
# build q2 network head
with tf.variable_scope("q2_head"):
@@ -74,7 +76,8 @@ class SACQHead(Head):
for layer_size in self.network_layers_sizes[1:]:
qi_output = self.dense_layer(layer_size)(qi_output, activation=self.activation_function)
# the output layer
- self.q2_output = self.dense_layer(1)(qi_output, name='q2_output')
+ self.q2_output = self.dense_layer(1)(qi_output, name='q2_output',
+ bias_initializer=self.output_bias_initializer)
# take the minimum as the network's output. this is the log_target (in the original implementation)
self.q_output = tf.minimum(self.q1_output, self.q2_output, name='q_output')
diff --git a/rl_coach/architectures/tensorflow_components/heads/td3_v_head.py b/rl_coach/architectures/tensorflow_components/heads/td3_v_head.py
index 86457ec..1457e32 100644
--- a/rl_coach/architectures/tensorflow_components/heads/td3_v_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/td3_v_head.py
@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition
class TD3VHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
- dense_layer=Dense, initializer='xavier'):
+ dense_layer=Dense, initializer='xavier', output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'td3_v_values_head'
@@ -35,6 +35,7 @@ class TD3VHead(Head):
self.initializer = initializer
self.loss = []
self.output = []
+ self.output_bias_initializer = output_bias_initializer
def _build_module(self, input_layer):
# Standard V Network
@@ -44,9 +45,11 @@ class TD3VHead(Head):
for i in range(input_layer.shape[0]): # assuming that the actual size is 2, as there are two critic networks
if self.initializer == 'normalized_columns':
q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1),
- kernel_initializer=normalized_columns_initializer(1.0)))
+ kernel_initializer=normalized_columns_initializer(1.0),
+ bias_initializer=self.output_bias_initializer),)
elif self.initializer == 'xavier' or self.initializer is None:
- q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1)))
+ q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1),
+ bias_initializer=self.output_bias_initializer))
self.output.append(q_outputs[i])
self.loss.append(tf.reduce_mean((self.target-q_outputs[i])**2))
diff --git a/rl_coach/architectures/tensorflow_components/heads/v_head.py b/rl_coach/architectures/tensorflow_components/heads/v_head.py
index 62bfba0..16ff185 100644
--- a/rl_coach/architectures/tensorflow_components/heads/v_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/v_head.py
@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition
class VHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
- dense_layer=Dense, initializer='normalized_columns'):
+ dense_layer=Dense, initializer='normalized_columns', output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'v_values_head'
@@ -38,14 +38,17 @@ class VHead(Head):
self.loss_type = tf.losses.mean_squared_error
self.initializer = initializer
+ self.output_bias_initializer = output_bias_initializer
def _build_module(self, input_layer):
# Standard V Network
if self.initializer == 'normalized_columns':
self.output = self.dense_layer(1)(input_layer, name='output',
- kernel_initializer=normalized_columns_initializer(1.0))
+ kernel_initializer=normalized_columns_initializer(1.0),
+ bias_initializer=self.output_bias_initializer)
elif self.initializer == 'xavier' or self.initializer is None:
- self.output = self.dense_layer(1)(input_layer, name='output')
+ self.output = self.dense_layer(1)(input_layer, name='output',
+ bias_initializer=self.output_bias_initializer)
def __str__(self):
result = [
diff --git a/rl_coach/architectures/tensorflow_components/layers.py b/rl_coach/architectures/tensorflow_components/layers.py
index eb63262..91c0c30 100644
--- a/rl_coach/architectures/tensorflow_components/layers.py
+++ b/rl_coach/architectures/tensorflow_components/layers.py
@@ -168,15 +168,18 @@ class Dense(layers.Dense):
def __init__(self, units: int):
super(Dense, self).__init__(units=units)
- def __call__(self, input_layer, name: str=None, kernel_initializer=None, activation=None, is_training=None):
+ def __call__(self, input_layer, name: str=None, kernel_initializer=None, bias_initializer=None,
+ activation=None, is_training=None):
"""
returns a tensorflow dense layer
:param input_layer: previous layer
:param name: layer name
:return: dense layer
"""
+ if bias_initializer is None:
+ bias_initializer = tf.zeros_initializer()
return tf.layers.dense(input_layer, self.units, name=name, kernel_initializer=kernel_initializer,
- activation=activation)
+ activation=activation, bias_initializer=bias_initializer)
@staticmethod
@reg_to_tf_instance(layers.Dense)
@@ -199,7 +202,8 @@ class NoisyNetDense(layers.NoisyNetDense):
def __init__(self, units: int):
super(NoisyNetDense, self).__init__(units=units)
- def __call__(self, input_layer, name: str, kernel_initializer=None, activation=None, is_training=None):
+ def __call__(self, input_layer, name: str, kernel_initializer=None, activation=None, is_training=None,
+ bias_initializer=None):
"""
returns a NoisyNet dense layer
:param input_layer: previous layer
@@ -233,10 +237,12 @@ class NoisyNetDense(layers.NoisyNetDense):
kernel_stddev_initializer = tf.random_uniform_initializer(-stddev * self.sigma0, stddev * self.sigma0)
else:
kernel_mean_initializer = kernel_stddev_initializer = kernel_initializer
+ if bias_initializer is None:
+ bias_initializer = tf.zeros_initializer()
with tf.variable_scope(None, default_name=name):
weight_mean = tf.get_variable('weight_mean', shape=(num_inputs, num_outputs),
initializer=kernel_mean_initializer)
- bias_mean = tf.get_variable('bias_mean', shape=(num_outputs,), initializer=tf.zeros_initializer())
+ bias_mean = tf.get_variable('bias_mean', shape=(num_outputs,), initializer=bias_initializer)
weight_stddev = tf.get_variable('weight_stddev', shape=(num_inputs, num_outputs),
initializer=kernel_stddev_initializer)
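A small sketch of what the extended `Dense` wrapper call looks like, mirroring the pattern used by the heads above (the placeholder shape and the constant initializer are assumptions):

    import tensorflow as tf
    from rl_coach.architectures.tensorflow_components.layers import Dense

    # Assumed toy input; in Coach this would be the output of the middleware.
    input_layer = tf.placeholder(tf.float32, shape=(None, 64))

    # When bias_initializer is None the wrapper now falls back to
    # tf.zeros_initializer(), so existing heads keep their previous behaviour.
    output = Dense(4)(input_layer, name='output',
                      bias_initializer=tf.constant_initializer(0.5))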
diff --git a/rl_coach/filters/reward/reward_normalization_filter.py b/rl_coach/filters/reward/reward_normalization_filter.py
index b708c93..4ad6ccb 100644
--- a/rl_coach/filters/reward/reward_normalization_filter.py
+++ b/rl_coach/filters/reward/reward_normalization_filter.py
@@ -64,15 +64,14 @@ class RewardNormalizationFilter(RewardFilter):
def filter(self, reward: RewardType, update_internal_state: bool=True) -> RewardType:
if update_internal_state:
+ if not isinstance(reward, np.ndarray) or len(reward.shape) < 2:
+ reward = np.array([[reward]])
self.running_rewards_stats.push(reward)
- reward = (reward - self.running_rewards_stats.mean) / \
- (self.running_rewards_stats.std + 1e-15)
- reward = np.clip(reward, self.clip_min, self.clip_max)
-
- return reward
+ return self.running_rewards_stats.normalize(reward).squeeze()
def get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace:
+ self.running_rewards_stats.set_params(shape=(1,), clip_values=(self.clip_min, self.clip_max))
return input_reward_space
def save_state_to_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str):
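The new guard simply lifts scalar rewards to the 2-D shape the running-statistics object expects before pushing them; a standalone numpy illustration of that reshaping (outside the filter, purely for clarity):

    import numpy as np

    def to_2d(reward):
        # Mirrors the guard added in filter(): anything that is not already a
        # 2-D ndarray is wrapped into an extra batch dimension before being pushed.
        if not isinstance(reward, np.ndarray) or len(reward.shape) < 2:
            reward = np.array([[reward]])
        return reward

    print(to_2d(1.5).shape)            # (1, 1)
    print(to_2d(np.array(2.0)).shape)  # (1, 1)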
diff --git a/rl_coach/graph_managers/batch_rl_graph_manager.py b/rl_coach/graph_managers/batch_rl_graph_manager.py
index 5c9da2e..a19f6a3 100644
--- a/rl_coach/graph_managers/batch_rl_graph_manager.py
+++ b/rl_coach/graph_managers/batch_rl_graph_manager.py
@@ -37,7 +37,6 @@ from rl_coach.memories.episodic import EpisodicExperienceReplayParameters
from rl_coach.core_types import TimeTypes
-# TODO build a tutorial for batch RL
class BatchRLGraphManager(BasicRLGraphManager):
"""
A batch RL graph manager creates a scenario of learning from a dataset without a simulator.
@@ -95,6 +94,8 @@ class BatchRLGraphManager(BasicRLGraphManager):
self.schedule_params = schedule_params
def _create_graph(self, task_parameters: TaskParameters) -> Tuple[List[LevelManager], List[Environment]]:
+ assert self.agent_params.memory.load_memory_from_file_path or self.env_params, \
+ "BatchRL requires either a dataset to train from or an environment to collect a dataset from. "
if self.env_params:
# environment loading
self.env_params.seed = task_parameters.seed
@@ -172,36 +173,38 @@ class BatchRLGraphManager(BasicRLGraphManager):
# initialize the network parameters from the global network
self.sync()
- # TODO a bug in heatup where the last episode run is not fed into the ER. e.g. asked for 1024 heatup steps,
- # last ran episode ended increased the total to 1040 steps, but the ER will contain only 1014 steps.
- # The last episode is not there. Is this a bug in my changes or also on master?
+ # If we have both an environment and a dataset to load from, we will use the environment only for
+ # evaluating the policy, and will not run heatup. If no dataset is available to load from, we will be collecting
+ # a dataset from an environment.
+ if not self.agent_params.memory.load_memory_from_file_path:
+ if self.is_collecting_random_dataset:
+ # heatup
+ if self.env_params is not None:
+ screen.log_title(
+ "Collecting random-action experience to use for training the actual agent in a Batch RL "
+ "fashion")
+ # Creating a random dataset during the heatup phase is useful mainly for tutorial and debug
+ # purposes.
+ self.heatup(self.heatup_steps)
+ else:
+ screen.log_title(
+ "Starting to improve an agent collecting experience to use for training the actual agent in a "
+ "Batch RL fashion")
- # Creating a dataset during the heatup phase is useful mainly for tutorial and debug purposes. If we have both
- # an environment and a dataset to load from, we will use the environment only for evaluating the policy,
- # and will not run heatup.
+ # set the experience generating agent to train
+ self.level_managers[0].agents = {'experience_generating_agent': self.experience_generating_agent}
- screen.log_title("Starting to improve an agent collecting experience to use for training the actual agent in a "
- "Batch RL fashion")
+ # collect a dataset using the experience generating agent
+ super().improve()
- if self.is_collecting_random_dataset:
- # heatup
- if self.env_params is not None and not self.agent_params.memory.load_memory_from_file_path:
- self.heatup(self.heatup_steps)
- else:
- # set the experience generating agent to train
- self.level_managers[0].agents = {'experience_generating_agent': self.experience_generating_agent}
+ # set the acquired experience to the actual agent that we're going to train
+ self.agent.memory = self.experience_generating_agent.memory
- # collect a dataset using the experience generating agent
- super().improve()
+ # switch the graph scheduling parameters
+ self.set_schedule_params(self.schedule_params)
- # set the acquired experience to the actual agent that we're going to train
- self.agent.memory = self.experience_generating_agent.memory
-
- # switch the graph scheduling parameters
- self.set_schedule_params(self.schedule_params)
-
- # set the actual agent to train
- self.level_managers[0].agents = {'agent': self.agent}
+ # set the actual agent to train
+ self.level_managers[0].agents = {'agent': self.agent}
# this agent never actually plays
self.level_managers[0].agents['agent'].ap.algorithm.num_consecutive_playing_steps = EnvironmentSteps(0)
diff --git a/rl_coach/memories/episodic/episodic_experience_replay.py b/rl_coach/memories/episodic/episodic_experience_replay.py
index 8a54dad..9e18943 100644
--- a/rl_coach/memories/episodic/episodic_experience_replay.py
+++ b/rl_coach/memories/episodic/episodic_experience_replay.py
@@ -15,6 +15,8 @@
# limitations under the License.
#
import ast
+
+import pickle
from copy import deepcopy
import math
@@ -141,14 +143,27 @@ class EpisodicExperienceReplay(Memory):
def shuffle_episodes(self):
"""
- Shuffle all the episodes in the replay buffer
+ Shuffle all the complete episodes in the replay buffer, while deleting the last non-complete episode
:return:
"""
+ self.reader_writer_lock.lock_writing()
+
self.assert_not_frozen()
+ # unlike the standard usage of the EpisodicExperienceReplay, where we always leave an empty episode after
+        # the last full one, so that new transitions will have somewhere to be added, in this case we deliberately remove
+ # that empty last episode, as we are about to shuffle the memory, and we don't want it to be shuffled in
+ self.remove_last_episode(lock=False)
+
random.shuffle(self._buffer)
self.transitions = [t for e in self._buffer for t in e.transitions]
+ # create a new Episode for the next transitions to be placed into
+ self._buffer.append(Episode(n_step=self.n_step))
+ self._length += 1
+
+ self.reader_writer_lock.release_writing()
+
def get_shuffled_training_data_generator(self, size: int) -> List[Transition]:
"""
        Get a generator for iterating through the shuffled replay buffer, for processing the data in epochs.
@@ -201,10 +216,10 @@ class EpisodicExperienceReplay(Memory):
granularity, size = self.max_size
if granularity == MemoryGranularity.Transitions:
while size != 0 and self.num_transitions() > size:
- self._remove_episode(0)
+ self.remove_first_episode(lock=False)
elif granularity == MemoryGranularity.Episodes:
while self.length() > size:
- self._remove_episode(0)
+ self.remove_first_episode(lock=False)
def _update_episode(self, episode: Episode) -> None:
episode.update_transitions_rewards_and_bootstrap_data()
@@ -321,31 +336,53 @@ class EpisodicExperienceReplay(Memory):
def _remove_episode(self, episode_index: int) -> None:
"""
- Remove the episode in the given index (even if it is not complete yet)
- :param episode_index: the index of the episode to remove
+        Remove either the first or the last episode
+ :param episode_index: the index of the episode to remove (either 0 or -1)
:return: None
"""
self.assert_not_frozen()
+ assert episode_index == 0 or episode_index == -1, "_remove_episode only supports removing the first or the last " \
+ "episode"
- if len(self._buffer) > episode_index:
+ if len(self._buffer) > 0:
episode_length = self._buffer[episode_index].length()
self._length -= 1
self._num_transitions -= episode_length
self._num_transitions_in_complete_episodes -= episode_length
- del self.transitions[:episode_length]
+ if episode_index == 0:
+ del self.transitions[:episode_length]
+ else: # episode_index = -1
+ del self.transitions[-episode_length:]
del self._buffer[episode_index]
- def remove_episode(self, episode_index: int) -> None:
+ def remove_first_episode(self, lock: bool = True) -> None:
"""
- Remove the episode in the given index (even if it is not complete yet)
- :param episode_index: the index of the episode to remove
+ Remove the first episode (even if it is not complete yet)
+ :param lock: if true, will lock the readers writers lock. this can cause a deadlock if an inheriting class
+ already holds the lock and then calls this function with lock=True
:return: None
"""
- self.reader_writer_lock.lock_writing_and_reading()
+ if lock:
+ self.reader_writer_lock.lock_writing_and_reading()
- self._remove_episode(episode_index)
+ self._remove_episode(0)
+ if lock:
+ self.reader_writer_lock.release_writing_and_reading()
- self.reader_writer_lock.release_writing_and_reading()
+ def remove_last_episode(self, lock: bool = True) -> None:
+ """
+ Remove the last episode (even if it is not complete yet)
+ :param lock: if true, will lock the readers writers lock. this can cause a deadlock if an inheriting class
+ already holds the lock and then calls this function with lock=True
+ :return: None
+ """
+ if lock:
+ self.reader_writer_lock.lock_writing_and_reading()
+
+ self._remove_episode(-1)
+
+ if lock:
+ self.reader_writer_lock.release_writing_and_reading()
# for API compatibility
def get(self, episode_index: int, lock: bool = True) -> Union[None, Episode]:
@@ -372,15 +409,6 @@ class EpisodicExperienceReplay(Memory):
return episode
- # for API compatibility
- def remove(self, episode_index: int):
- """
- Remove the episode in the given index (even if it is not complete yet)
- :param episode_index: the index of the episode to remove
- :return: None
- """
- self.remove_episode(episode_index)
-
def clean(self) -> None:
"""
Clean the memory by removing all the episodes
@@ -446,7 +474,7 @@ class EpisodicExperienceReplay(Memory):
transitions.append(
Transition(state={'observation': state},
- action=current_transition['action'], reward=current_transition['reward'],
+ action=int(current_transition['action']), reward=current_transition['reward'],
next_state={'observation': next_state}, game_over=False,
info={'all_action_probabilities':
ast.literal_eval(current_transition['all_action_probabilities'])}),
@@ -516,3 +544,36 @@ class EpisodicExperienceReplay(Memory):
self.last_training_set_episode_id = episode_num
self.last_training_set_transition_id = \
len([t for e in self.get_all_complete_episodes_from_to(0, self.last_training_set_episode_id + 1) for t in e])
+
+ def save(self, file_path: str) -> None:
+ """
+ Save the replay buffer contents to a pickle file
+ :param file_path: the path to the file that will be used to store the pickled episodes
+ """
+ with open(file_path, 'wb') as file:
+ pickle.dump(self.get_all_complete_episodes(), file)
+
+ def load_pickled(self, file_path: str) -> None:
+ """
+ Restore the replay buffer contents from a pickle file.
+ The pickle file is assumed to include a list of episodes.
+ :param file_path: The path to a pickle file to restore
+ """
+ self.assert_not_frozen()
+
+ with open(file_path, 'rb') as file:
+ episodes = pickle.load(file)
+ num_transitions = sum([len(e.transitions) for e in episodes])
+ if num_transitions > self.max_size[1]:
+ screen.warning("Warning! The number of transition to load into the replay buffer ({}) is "
+ "bigger than the max size of the replay buffer ({}). The excessive transitions will "
+ "not be stored.".format(num_transitions, self.max_size[1]))
+
+ progress_bar = ProgressBar(len(episodes))
+ for episode_idx, episode in enumerate(episodes):
+ self.store_episode(episode)
+
+ # print progress
+ progress_bar.update(episode_idx)
+
+ progress_bar.close()
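For reference, a minimal usage sketch of the new save / load_pickled methods (not part of the patch; the buffer size and file path are arbitrary choices for this example, and the constructor is assumed to take the same (granularity, size) max_size tuple used elsewhere in this patch):

    from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay
    from rl_coach.memories.memory import MemoryGranularity

    # create a small episodic replay buffer (size chosen arbitrarily for the example)
    memory = EpisodicExperienceReplay(max_size=(MemoryGranularity.Transitions, 10000))

    # ... store episodes into the buffer during heatup / training ...

    # dump all complete episodes to a pickle file
    memory.save('acrobot_replay_buffer.p')

    # later, restore the episodes into a fresh (non-frozen) buffer
    restored = EpisodicExperienceReplay(max_size=(MemoryGranularity.Transitions, 10000))
    restored.load_pickled('acrobot_replay_buffer.p')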
diff --git a/rl_coach/memories/memory.py b/rl_coach/memories/memory.py
index c27d590..cce24fa 100644
--- a/rl_coach/memories/memory.py
+++ b/rl_coach/memories/memory.py
@@ -58,9 +58,6 @@ class Memory(object):
def get(self, index):
raise NotImplementedError("")
- def remove(self, index):
- raise NotImplementedError("")
-
def length(self):
raise NotImplementedError("")
diff --git a/rl_coach/memories/non_episodic/experience_replay.py b/rl_coach/memories/non_episodic/experience_replay.py
index 1570c87..0a9ef6f 100644
--- a/rl_coach/memories/non_episodic/experience_replay.py
+++ b/rl_coach/memories/non_episodic/experience_replay.py
@@ -198,15 +198,6 @@ class ExperienceReplay(Memory):
"""
return self.get_transition(transition_index, lock)
- # for API compatibility
- def remove(self, transition_index: int, lock: bool=True):
- """
- Remove the transition in the given index
- :param transition_index: the index of the transition to remove
- :return: None
- """
- self.remove_transition(transition_index, lock)
-
def clean(self, lock: bool=True) -> None:
"""
Clean the memory by removing all the episodes
diff --git a/rl_coach/presets/Acrobot_DDQN_BCQ_BatchRL.py b/rl_coach/presets/Acrobot_DDQN_BCQ_BatchRL.py
new file mode 100644
index 0000000..cda8a45
--- /dev/null
+++ b/rl_coach/presets/Acrobot_DDQN_BCQ_BatchRL.py
@@ -0,0 +1,116 @@
+import tensorflow as tf
+
+from rl_coach.agents.ddqn_agent import DDQNAgentParameters
+from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
+from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, CsvDataset
+from rl_coach.environments.gym_environment import GymVectorEnvironment
+from rl_coach.graph_managers.batch_rl_graph_manager import BatchRLGraphManager
+from rl_coach.graph_managers.graph_manager import ScheduleParameters
+from rl_coach.memories.memory import MemoryGranularity
+from rl_coach.schedules import LinearSchedule
+from rl_coach.memories.episodic import EpisodicExperienceReplayParameters
+from rl_coach.architectures.head_parameters import QHeadParameters
+from rl_coach.agents.ddqn_bcq_agent import DDQNBCQAgentParameters
+
+from rl_coach.agents.ddqn_bcq_agent import KNNParameters
+
+DATASET_SIZE = 50000
+
+
+####################
+# Graph Scheduling #
+####################
+
+schedule_params = ScheduleParameters()
+schedule_params.improve_steps = TrainingSteps(10000000000)
+schedule_params.steps_between_evaluation_periods = TrainingSteps(1)
+schedule_params.evaluation_steps = EnvironmentEpisodes(10)
+schedule_params.heatup_steps = EnvironmentSteps(DATASET_SIZE)
+
+#########
+# Agent #
+#########
+
+agent_params = DDQNBCQAgentParameters()
+agent_params.network_wrappers['main'].batch_size = 128
+# TODO cross-DL framework abstraction for a constant initializer?
+agent_params.network_wrappers['main'].heads_parameters = [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
+
+agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
+agent_params.algorithm.discount = 0.99
+
+agent_params.algorithm.action_drop_method_parameters = KNNParameters()
+
+# NN configuration
+agent_params.network_wrappers['main'].learning_rate = 0.0001
+agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
+agent_params.network_wrappers['main'].softmax_temperature = 0.2
+
+# ER size
+agent_params.memory = EpisodicExperienceReplayParameters()
+# DATATSET_PATH = 'acrobot.csv'
+# agent_params.memory.load_memory_from_file_path = CsvDataset(DATATSET_PATH, True)
+
+# E-Greedy schedule
+agent_params.exploration.epsilon_schedule = LinearSchedule(0, 0, 10000)
+agent_params.exploration.evaluation_epsilon = 0
+
+# Experience Generating Agent parameters
+experience_generating_agent_params = DDQNAgentParameters()
+
+# schedule parameters
+experience_generating_schedule_params = ScheduleParameters()
+experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
+experience_generating_schedule_params.improve_steps = TrainingSteps(
+ DATASET_SIZE - experience_generating_schedule_params.heatup_steps.num_steps)
+experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
+experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(1)
+
+# DQN params
+experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(100)
+experience_generating_agent_params.algorithm.discount = 0.99
+experience_generating_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)
+
+# NN configuration
+experience_generating_agent_params.network_wrappers['main'].learning_rate = 0.0001
+experience_generating_agent_params.network_wrappers['main'].batch_size = 128
+experience_generating_agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
+experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
+[QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
+
+# ER size
+experience_generating_agent_params.memory = EpisodicExperienceReplayParameters()
+experience_generating_agent_params.memory.max_size = \
+ (MemoryGranularity.Transitions,
+ experience_generating_schedule_params.heatup_steps.num_steps +
+ experience_generating_schedule_params.improve_steps.num_steps + 1)
+
+# E-Greedy schedule
+experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, DATASET_SIZE)
+experience_generating_agent_params.exploration.evaluation_epsilon = 0
+
+
+################
+# Environment #
+################
+env_params = GymVectorEnvironment(level='Acrobot-v1')
+
+########
+# Test #
+########
+preset_validation_params = PresetValidationParameters()
+preset_validation_params.test = True
+preset_validation_params.min_reward_threshold = 150
+preset_validation_params.max_episodes_to_achieve_reward = 50
+preset_validation_params.read_csv_tries = 500
+
+graph_manager = BatchRLGraphManager(agent_params=agent_params,
+ experience_generating_agent_params=experience_generating_agent_params,
+ experience_generating_schedule_params=experience_generating_schedule_params,
+ env_params=env_params,
+ schedule_params=schedule_params,
+ vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
+ preset_validation_params=preset_validation_params,
+ reward_model_num_epochs=30,
+ train_to_eval_ratio=0.4)
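For reference (not part of the patch): a preset like this is normally launched through the coach command line, e.g. coach -p Acrobot_DDQN_BCQ_BatchRL, or programmatically. A minimal programmatic sketch, with an arbitrary experiment path:

    from rl_coach.base_parameters import TaskParameters
    from rl_coach.presets.Acrobot_DDQN_BCQ_BatchRL import graph_manager

    # build the graph and run the full Batch RL flow defined by the preset
    graph_manager.create_graph(TaskParameters(experiment_path='./acrobot_ddqn_bcq_batch_rl'))
    graph_manager.improve()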
diff --git a/rl_coach/presets/CartPole_DQN_BatchRL_BCQ.py b/rl_coach/presets/CartPole_DDQN_BCQ_BatchRL.py
similarity index 100%
rename from rl_coach/presets/CartPole_DQN_BatchRL_BCQ.py
rename to rl_coach/presets/CartPole_DDQN_BCQ_BatchRL.py
diff --git a/rl_coach/presets/CartPole_DQN_BatchRL.py b/rl_coach/presets/CartPole_DDQN_BatchRL.py
similarity index 100%
rename from rl_coach/presets/CartPole_DQN_BatchRL.py
rename to rl_coach/presets/CartPole_DDQN_BatchRL.py
diff --git a/tutorials/4. Batch Reinforcement Learning.ipynb b/tutorials/4. Batch Reinforcement Learning.ipynb
new file mode 100644
index 0000000..d424d4e
--- /dev/null
+++ b/tutorials/4. Batch Reinforcement Learning.ipynb
@@ -0,0 +1,378 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Batch Reinforcement Learning with Coach"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In many real-world problems, a learning agent cannot interact with the real environment or with a simlulated one. This might be due to the risk of taking sub-optimal actions in the real world, or due to the complexity of creating a simluator that immitates correctly the real environment dynamics. In such cases, the learning agent is only exposed to data that was collected using some deployed policy, and we would like to use that data to learn a better policy for solving the problem. \n",
+ "One such example might be developing a better drug dose or admission scheduling policy. We have data based on the policy that was used with patients so far, but cannot experiment (and explore) on patients to collect new data. \n",
+ "\n",
+ "But wait... If we don't have a simulator, how would we evaluate our newly learned policy and know if it is any good? Which algorithms should we be using in order to better address the problem of learning only from a batch of data? \n",
+ "\n",
+ "Alternatively, what do we do if we don't have a simulator, but instead we can actually deploy our policy on that real-world environment, and would just like to separate the new data collection part from the learning part (i.e. if we have a system that can quite easily run inference, but is very hard to integrate a reinforcement learning framework with, such as Coach, for learning a new policy).\n",
+ "\n",
+ "We will try to address these questions and more in this tutorial, demonstrating how to use [Batch Reinforcement Learning](http://tgabel.de/cms/fileadmin/user_upload/documents/Lange_Gabel_EtAl_RL-Book-12.pdf). \n",
+ "\n",
+ "First, let's use a simple environment to collect the data to be used for learning a policy using Batch RL. In reality, we probably would already have a dataset of transitions of the form `` to be used for learning a new policy. Ideally, we would also have, for each transtion, $p(a|o)$ the probabilty of an action, given that transition's `current_observation`. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "First, get the required imports and other general settings we need for this notebook."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Solving Acrobot with Batch RL"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from copy import deepcopy\n",
+ "import tensorflow as tf\n",
+ "import os\n",
+ "\n",
+ "from rl_coach.agents.dqn_agent import DQNAgentParameters\n",
+ "from rl_coach.agents.ddqn_bcq_agent import DDQNBCQAgentParameters, KNNParameters\n",
+ "from rl_coach.base_parameters import VisualizationParameters\n",
+ "from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, CsvDataset\n",
+ "from rl_coach.environments.gym_environment import GymVectorEnvironment\n",
+ "from rl_coach.graph_managers.batch_rl_graph_manager import BatchRLGraphManager\n",
+ "from rl_coach.graph_managers.graph_manager import ScheduleParameters\n",
+ "from rl_coach.memories.memory import MemoryGranularity\n",
+ "from rl_coach.schedules import LinearSchedule\n",
+ "from rl_coach.memories.episodic import EpisodicExperienceReplayParameters\n",
+ "from rl_coach.architectures.head_parameters import QHeadParameters\n",
+ "from rl_coach.agents.ddqn_agent import DDQNAgentParameters\n",
+ "from rl_coach.base_parameters import TaskParameters\n",
+ "from rl_coach.spaces import SpacesDefinition, DiscreteActionSpace, VectorObservationSpace, StateSpace, RewardSpace\n",
+ "\n",
+ "# Get all the outputs of this tutorial out of the 'Resources' folder\n",
+ "os.chdir('Resources')\n",
+ "\n",
+ "# the dataset size to collect \n",
+ "DATASET_SIZE = 50000\n",
+ "\n",
+ "task_parameters = TaskParameters(experiment_path='.')\n",
+ "\n",
+ "####################\n",
+ "# Graph Scheduling #\n",
+ "####################\n",
+ "\n",
+ "schedule_params = ScheduleParameters()\n",
+ "\n",
+ "# 100 epochs (we run train over all the dataset, every epoch) of training\n",
+ "schedule_params.improve_steps = TrainingSteps(100)\n",
+ "\n",
+ "# we evaluate the model every epoch\n",
+ "schedule_params.steps_between_evaluation_periods = TrainingSteps(1)\n",
+ "\n",
+ "# only for when we have an enviroment\n",
+ "schedule_params.evaluation_steps = EnvironmentEpisodes(10)\n",
+ "schedule_params.heatup_steps = EnvironmentSteps(DATASET_SIZE)\n",
+ "\n",
+ "################\n",
+ "# Environment #\n",
+ "################\n",
+ "env_params = GymVectorEnvironment(level='Acrobot-v1')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's use OpenAI Gym's `Acrobot-v1` in order to collect a dataset of experience, and then use that dataset in order to learn a policy solving the environment using Batch RL. \n",
+ "\n",
+ "### The Preset \n",
+ "\n",
+ "First we will collect a dataset using a random action selecting policy. Then we will use that dataset to train an agent in a Batch RL fashion. \n",
+ "Let's start simple - training an agent with Double DQN. \n",
+ "\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.reset_default_graph() # just to clean things up; only needed for the tutorial\n",
+ "\n",
+ "#########\n",
+ "# Agent #\n",
+ "#########\n",
+ "agent_params = DDQNAgentParameters()\n",
+ "agent_params.network_wrappers['main'].batch_size = 128\n",
+ "agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)\n",
+ "agent_params.algorithm.discount = 0.99\n",
+ "\n",
+ "# to jump start the agent's q values, and speed things up, we'll initialize the last Dense layer's bias\n",
+ "# with a number in the order of the discounted reward of a random policy\n",
+ "agent_params.network_wrappers['main'].heads_parameters = \\\n",
+ "[QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]\n",
+ "\n",
+ "# NN configuration\n",
+ "agent_params.network_wrappers['main'].learning_rate = 0.0001\n",
+ "agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False\n",
+ "\n",
+ "# ER - we'll need an episodic replay buffer for off-policy evaluation\n",
+ "agent_params.memory = EpisodicExperienceReplayParameters()\n",
+ "\n",
+ "# E-Greedy schedule - there is no exploration in Batch RL. Disabling E-Greedy. \n",
+ "agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0, final_value=0, decay_steps=1)\n",
+ "agent_params.exploration.evaluation_epsilon = 0\n",
+ "\n",
+ "\n",
+ "graph_manager = BatchRLGraphManager(agent_params=agent_params,\n",
+ " env_params=env_params,\n",
+ " schedule_params=schedule_params,\n",
+ " vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),\n",
+ " reward_model_num_epochs=30)\n",
+ "graph_manager.create_graph(task_parameters)\n",
+ "graph_manager.improve()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First we see Coach running a long heatup of 50,000 steps (as we have defined a `DATASET_SIZE` of 50,000 in the preliminaries section), in order to collect a dataset of random actions. Then we can see Coach training a supervised reward model that is needed for the `Doubly Robust` OPE (off-policy evaluation). Last, Coach starts using the collected dataset of experience to train a Double DQN agent. Since, for this environment, we actually do have a simulator, Coach will be using it to evaluate the learned policy. As you can probably see, since this is a very simple environment, a dataset of just random actions is enough to get a Double DQN agent training, and reaching rewards of less than -100 (actually solving the environment). As you can also probably notice, the learning is not very stable, and if you take a look at the Q values predicted by the agent (e.g. in Coach Dashboard; this tutorial experiment results are under the `Resources` folder), you will see them increasing unboundedly. This is caused due to the Batch RL based learning, where not interacting with the environment any further, while randomly exposing only small parts of the MDP in the dataset, makes learning even harder than standard Off-Policy RL. This phenomena is very nicely explained in [Off-Policy Deep Reinforcement Learning without Exploration](https://arxiv.org/abs/1812.02900). We have implemented a discrete-actions variant of [Batch Constrained Q-Learning](https://github.com/NervanaSystems/coach/blob/master/rl_coach/agents/ddqn_bcq_agent.py), which helps mitigating this issue. "
+ ]
+ },
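+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a rough sketch of the idea behind the discrete-action BCQ variant (Coach's `ddqn_bcq_agent` can use either a kNN or an NN-based model for this, and its exact rule may differ from the formula below): when building the DDQN target, the argmax over next-state actions is restricted to actions that the data-collecting policy is likely to have taken,\n",
+ "\n",
+ "$$y = r + \\gamma \\, Q_{\\text{target}}\\big(s',\\ \\arg\\max_{a' \\in \\mathcal{A}_{\\text{likely}}(s')} Q(s', a')\\big),$$\n",
+ "\n",
+ "where $\\mathcal{A}_{\\text{likely}}(s')$ contains only actions with sufficiently high estimated probability under the behavior policy at $s'$. This prevents bootstrapping from actions that are absent or very rare in the dataset, which is what drives the unbounded Q values described above.\n"
+ ]
+ },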
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next, let's switch to a dataset containing data combined from several 'deployed' policies, as is often the case in real-world scenarios, where we already have a policy (hopefully not a random one) in-place and we want to improve it. For instance, a recommender system already using a policy for generating recommendations, and we want to use Batch RL to learn a better policy. \n",
+ "\n",
+ "We will demonstrate that by training an agent, and using its replay buffer content as the dataset from which we will learn a new policy, without any further interaction with the environment. This should allow for both a better trained agent and for more meaningful Off-Policy Evaluation (as the more extensive your input data is, i.e. exposing more of the MDP, the better the evaluation of a new policy based on it)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.reset_default_graph() # just to clean things up; only needed for the tutorial\n",
+ "\n",
+ "# Experience Generating Agent parameters\n",
+ "experience_generating_agent_params = DDQNAgentParameters()\n",
+ "\n",
+ "# schedule parameters\n",
+ "experience_generating_schedule_params = ScheduleParameters()\n",
+ "experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)\n",
+ "experience_generating_schedule_params.improve_steps = TrainingSteps(\n",
+ " DATASET_SIZE - experience_generating_schedule_params.heatup_steps.num_steps)\n",
+ "experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)\n",
+ "experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(1)\n",
+ "\n",
+ "# DQN params\n",
+ "experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(100)\n",
+ "experience_generating_agent_params.algorithm.discount = 0.99\n",
+ "experience_generating_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)\n",
+ "\n",
+ "# NN configuration\n",
+ "experience_generating_agent_params.network_wrappers['main'].learning_rate = 0.0001\n",
+ "experience_generating_agent_params.network_wrappers['main'].batch_size = 128\n",
+ "experience_generating_agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False\n",
+ "experience_generating_agent_params.network_wrappers['main'].heads_parameters = \\\n",
+ "[QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]\n",
+ "\n",
+ "# ER size\n",
+ "experience_generating_agent_params.memory = EpisodicExperienceReplayParameters()\n",
+ "experience_generating_agent_params.memory.max_size = \\\n",
+ " (MemoryGranularity.Transitions,\n",
+ " experience_generating_schedule_params.heatup_steps.num_steps +\n",
+ " experience_generating_schedule_params.improve_steps.num_steps)\n",
+ "\n",
+ "# E-Greedy schedule\n",
+ "experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, DATASET_SIZE)\n",
+ "experience_generating_agent_params.exploration.evaluation_epsilon = 0\n",
+ "\n",
+ "# 50 epochs of training (the entire dataset is used each epoch)\n",
+ "schedule_params.improve_steps = TrainingSteps(50)\n",
+ "\n",
+ "graph_manager = BatchRLGraphManager(agent_params=agent_params,\n",
+ " experience_generating_agent_params=experience_generating_agent_params,\n",
+ " experience_generating_schedule_params=experience_generating_schedule_params,\n",
+ " env_params=env_params,\n",
+ " schedule_params=schedule_params,\n",
+ " vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),\n",
+ " reward_model_num_epochs=30,\n",
+ " train_to_eval_ratio=0.5)\n",
+ "graph_manager.create_graph(task_parameters)\n",
+ "graph_manager.improve()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Off-Policy Evaluation\n",
+ "As we mentioned earlier, one of the hardest problems in Batch RL is that we do not have a simulator or cannot easily deploy a trained policy on the real-world environment, in order to test its goodness. This is where OPE comes in handy. \n",
+ "\n",
+ "Coach supports several off-policy evaluators, some are useful for bandits problems (only evaluating a single step return), and others are for full-blown Reinforcement Learning problems. The main goal of the OPEs is to help us select the best model, either for collecting more data to do another round of Batch RL on, or for actual deployment in the real-world environment. \n",
+ "\n",
+ "Opening the experiment that we have just ran (under the `tutorials/Resources` folder, with Coach Dashboard), you will be able to plot the actual simulator's `Evaluation Reward`. Usually, we won't have this signal available as we won't have a simulator, but since we're using a dummy environment for demonstration purposes, we can take a look and examine how the OPEs correlate with it. \n",
+ "\n",
+ "Here are two example plots from Dashboard showing how well the `Weighted Importance Sampling` (RL estimator) and the `Doubly Robust` (bandits estimator) each correlate with the `Evaluation Reward`. \n",
+ " \n",
+ "\n",
+ " \n",
+ "\n"
+ ]
+ },
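+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For reference, a standard textbook form of the Weighted Importance Sampling estimator is\n",
+ "\n",
+ "$$\\hat{V}_{WIS} = \\frac{\\sum_{i=1}^{n} w_i G_i}{\\sum_{i=1}^{n} w_i}, \\qquad w_i = \\prod_{t=0}^{T_i - 1} \\frac{\\pi_e(a^i_t \\mid s^i_t)}{\\pi_b(a^i_t \\mid s^i_t)},$$\n",
+ "\n",
+ "where $G_i$ is the discounted return of episode $i$, $\\pi_e$ is the evaluated (newly trained) policy and $\\pi_b$ is the behavior policy that generated the dataset (the exact computation Coach performs may differ in details such as per-decision weighting). This is also why each transition in the dataset needs the behavior policy's action probabilities, $p(a|o)$ - the `all_action_probabilities` column of the CSV format shown later.\n"
+ ]
+ },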
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using a Dataset to Feed a Batch RL Algorithm "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Ok, so we now understand how things are expected to work. But, hey... if we don't have a simulator (which we did have in this tutorial so far, and have used it to generate a training/evaluation dataset) how will we feed Coach with the dataset to train/evaluate on?\n",
+ "\n",
+ "### The CSV\n",
+ "Coach defines a csv data format that can be used to fill its replay buffer. We have created an example csv from the same `Acrobot-v1` environment, and have placed it under the [Tutorials' Resources folder](https://github.com/NervanaSystems/coach/tree/master/tutorials/Resources).\n",
+ "\n",
+ "Here are the first couple of lines from it so you can get a grip of what to expect - \n",
+ "\n",
+ "| action | all_action_probabilities | episode_id | episode_name | reward | transition_number | state_feature_0 | state_feature_1 | state_feature_2 | state_feature_3 | state_feature_4 | state_feature_5 \n",
+ "|---|---|---|---|---|---|---|---|---|---|---|---------------------------------------------------------------------------|\n",
+ "|0|[0.4159157,0.23191088,0.35217342]|0|acrobot|-1|0|0.996893843|0.078757007|0.997566524|0.069721088|-0.078539907|-0.072449002 |\n",
+ "|1|[0.46244532,0.22402011,0.31353462]|0|acrobot|-1|1|0.997643051|0.068617369|0.999777604|0.021088905|-0.022653483|-0.40743716|\n",
+ "|0|[0.4961428,0.21575058,0.2881066]|0|acrobot|-1|2|0.997613067|0.069051922|0.996147629|-0.087692077|0.023128103|-0.662019594|\n",
+ "|2|[0.49341106,0.22363988,0.28294897]|0|acrobot|-1|3|0.997141344|0.075558854|0.972780655|-0.231727853|0.035575821|-0.771402023|\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### The Preset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.reset_default_graph() # just to clean things up; only needed for the tutorial\n",
+ "\n",
+ "#########\n",
+ "# Agent #\n",
+ "#########\n",
+ "# note that we have moved to BCQ, which will help the training to converge better and faster\n",
+ "agent_params = DDQNBCQAgentParameters() \n",
+ "agent_params.network_wrappers['main'].batch_size = 128\n",
+ "agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)\n",
+ "agent_params.algorithm.discount = 0.99\n",
+ "\n",
+ "# to jump start the agent's q values, and speed things up, we'll initialize the last Dense layer\n",
+ "# with something in the order of the discounted reward of a random policy\n",
+ "agent_params.network_wrappers['main'].heads_parameters = \\\n",
+ "[QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]\n",
+ "\n",
+ "# NN configuration\n",
+ "agent_params.network_wrappers['main'].learning_rate = 0.0001\n",
+ "agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False\n",
+ "\n",
+ "# ER - we'll be needing an episodic replay buffer for off-policy evaluation\n",
+ "agent_params.memory = EpisodicExperienceReplayParameters()\n",
+ "\n",
+ "# E-Greedy schedule - there is no exploration in Batch RL. Disabling E-Greedy. \n",
+ "agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0, final_value=0, decay_steps=1)\n",
+ "agent_params.exploration.evaluation_epsilon = 0\n",
+ "\n",
+ "# can use either a kNN or a NN based model for predicting which actions not to max over in the bellman equation\n",
+ "agent_params.algorithm.action_drop_method_parameters = KNNParameters()\n",
+ "\n",
+ "\n",
+ "DATATSET_PATH = 'acrobot_dataset.csv'\n",
+ "agent_params.memory = EpisodicExperienceReplayParameters()\n",
+ "agent_params.memory.load_memory_from_file_path = CsvDataset(DATATSET_PATH, is_episodic = True)\n",
+ "\n",
+ "spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=6)}),\n",
+ " goal=None,\n",
+ " action=DiscreteActionSpace(3),\n",
+ " reward=RewardSpace(1))\n",
+ "\n",
+ "graph_manager = BatchRLGraphManager(agent_params=agent_params,\n",
+ " env_params=None,\n",
+ " spaces_definition=spaces,\n",
+ " schedule_params=schedule_params,\n",
+ " vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),\n",
+ " reward_model_num_epochs=30,\n",
+ " train_to_eval_ratio=0.4)\n",
+ "graph_manager.create_graph(task_parameters)\n",
+ "graph_manager.improve()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Model Selection with OPE\n",
+ "Running the above preset will train an agent based on the experience in the csv dataset. Note that now we are finally demonstarting the real scenario with Batch Reinforcement Learning, where we train and evaluate solely based on the recorded dataset. Coach uses the same dataset (after internally splitting it, obviously) for both training and evaluation. \n",
+ "\n",
+ "Now that we have ran this preset, we have 100 agents (one is saved after every training epoch), and we would have to decide which one we choose for deployment (either for running another round of experience collection and training, or for final deployment, meaning going into production). \n",
+ "\n",
+ "Opening the experiment csv in Dashboard and displaying the OPE signals, we can now choose a checkpoint file for deployment on the end-node. Here is an example run, where we show the `Weighted Importance Sampling` and `Sequential Doubly Robust` OPEs. \n",
+ "\n",
+ " \n",
+ "\n",
+ "Based on this plot we would probably have chosen a checkpoint from around Epoch 85. From here, if we are not satisfied with the deployed agent's performance, we can iteratively continue with data collection, policy training (maybe based on a combination of all the data collected so far), and deployment. \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tutorials/Resources/img/dr.png b/tutorials/Resources/img/dr.png
new file mode 100644
index 0000000000000000000000000000000000000000..925069870ada060aae3399360a8b837c1ab2f565
GIT binary patch
literal 108159
zcmd42V|ZR&w+0&9Xlyh_W81dV*ya-(ZJfqV8e5HRqd{Y%v90su{l2~TtMm6DI%gM
zB_cws=wxqZX=4fkBKajT8Cvz*BAWl%TM-8g1Sv7aHD!BH0*W>W#ZMtsVKhl9W=aSY
zp4z5BYaua+(XR~37`pJp)}W#_^nr*^P+F?9t~CZCyDwd2#~to#Ui)$r8BeC4+|5q;
z+$T~&1iPIn;$>6d1d}**aB+?&!xbb=HlRRoi$TGdz+D&=C=H!BH$mb`Zq6=W+5HE8
z;jwSmyuRGM(O7v0L_&k~32}xnZkv*L<4Z3!?+jG2Sc_VfUGdIOBT5U
zSyx;8A*K$@Fe0I|PwGi$48v5GfkSTin|UgVx4$Ii@D9U^K-IRNjC!FrO&Cs|NiHin
zn%2Qk=IXE}exZru$Vxc%hl`KTA2q3vzoZ&vMJ96gd)o=7IN9&5oKwqVLe?ZY)-Fz~
z(8WTE%hk9z=6}*CP3X@ss&wJ5;w2Ne#DZHGxqs;S-9$Zs+CPzLq$RGpP>(EQnLI+4
zcmv