SAC (Soft Actor-Critic)

class srl.algorithms.sac.Config(batch_size: int = 32, memory_capacity: int = 100000, memory_warmup_size: int = 1000, memory_compress: bool = True, memory_compress_level: int = -1, observation_mode: str | ~srl.base.define.ObservationModes = ObservationModes.ENV, override_observation_type: ~srl.base.define.SpaceTypes = SpaceTypes.UNKNOWN, override_action_type: str | ~srl.base.define.RLBaseActTypes = <RLBaseActTypes.NONE: 1>, action_division_num: int = 10, observation_division_num: int = 1000, frameskip: int = 0, extend_worker: ~typing.Type[ExtendWorker] | None = None, parameter_path: str = '', memory_path: str = '', use_rl_processor: bool = True, processors: ~typing.List[RLProcessor] = <factory>, render_image_processors: ~typing.List[RLProcessor] = <factory>, enable_state_encode: bool = True, enable_action_decode: bool = True, enable_reward_encode: bool = True, enable_done_encode: bool = True, window_length: int = 1, render_image_window_length: int = 1, enable_sanitize: bool = True, enable_assertion: bool = False, discount: float = 0.9, lr_policy: float | ~srl.rl.schedulers.scheduler.SchedulerConfig = 0.001, lr_q: float | ~srl.rl.schedulers.scheduler.SchedulerConfig = 0.001, lr_alpha: float | ~srl.rl.schedulers.scheduler.SchedulerConfig = 0.001, soft_target_update_tau: float = 0.02, hard_target_update_interval: int = 100, enable_normal_squashed: bool = True, entropy_alpha_auto_scale: bool = True, entropy_alpha: float = 0.2, entropy_bonus_exclude_q: bool = False, enable_stable_gradients: bool = True, stable_gradients_scale_range: tuple = (1e-10, 10))

Bases: <ExperienceReplayBuffer>, <RLConfigComponentInput>
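A minimal usage sketch. This assumes the library's standard Runner interface; the environment name, hyperparameter values, and training budget are illustrative, not recommendations:

    import srl
    from srl.algorithms import sac

    # Create the SAC configuration, overriding a few hyperparameters
    rl_config = sac.Config(
        batch_size=64,
        discount=0.99,
        soft_target_update_tau=0.005,
    )

    # Train on a continuous-control environment
    runner = srl.Runner("Pendulum-v1", rl_config)
    runner.train(max_train_count=10_000)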

policy_hidden_block: MLPBlockConfig

<MLPBlock> configuration for the policy network's hidden layers

q_hidden_block: MLPBlockConfig

<MLPBlock> configuration for the Q network's hidden layers
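Both hidden blocks can be reshaped before training. The sketch below assumes MLPBlockConfig exposes a set() method that takes the hidden layer sizes; check the <MLPBlock> documentation for the exact API:

    from srl.algorithms import sac

    rl_config = sac.Config()
    # Assumed API: set() replaces the hidden layer sizes
    rl_config.policy_hidden_block.set((256, 256))
    rl_config.q_hidden_block.set((256, 256))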

discount: float = 0.9

Discount factor (gamma) applied to future rewards

lr_policy: float | SchedulerConfig = 0.001

Learning rate for the policy network

lr_q: float | SchedulerConfig = 0.001

Learning rate for the Q network

lr_alpha: float | SchedulerConfig = 0.001

Learning rate for the entropy temperature alpha
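Per the type annotations, each of the three learning rates accepts either a plain float or a <SchedulerConfig>; assigning a float is the simplest form (the values below are illustrative):

    from srl.algorithms import sac

    rl_config = sac.Config()
    rl_config.lr_policy = 3e-4  # policy network
    rl_config.lr_q = 3e-4       # Q network
    rl_config.lr_alpha = 3e-4   # entropy temperature alpha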

soft_target_update_tau: float = 0.02

Tau used for the soft (Polyak) update of the target network

hard_target_update_interval: int = 100

Number of training steps between hard copies of the online network weights to the target network
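For reference, the conventional relationship between these two settings, shown as a framework-agnostic NumPy sketch over lists of parameter arrays (not the library's internal code):

    import numpy as np

    def soft_update(online, target, tau=0.02):
        # theta_target <- tau * theta_online + (1 - tau) * theta_target,
        # applied every training step
        return [tau * o + (1 - tau) * t for o, t in zip(online, target)]

    def hard_update(online):
        # Full copy, performed every hard_target_update_interval training steps
        return [np.copy(p) for p in online]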

enable_normal_squashed: bool = True

Whether, when the action space is continuous, to squash samples from the normal distribution into [-1, 1] with tanh
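What the squashing does, as a standalone NumPy sketch (illustrative, not the library's implementation): a pre-squash action u is sampled from N(mu, sigma^2), mapped into (-1, 1) with tanh, and the log-probability is corrected by the tanh change-of-variables term:

    import numpy as np

    def sample_squashed(mu, sigma, rng=np.random.default_rng()):
        u = rng.normal(mu, sigma)  # pre-squash sample u ~ N(mu, sigma^2)
        a = np.tanh(u)             # squashed action in (-1, 1)
        # log N(u; mu, sigma) minus the tanh Jacobian correction log(1 - tanh(u)^2)
        log_prob = (
            -0.5 * ((u - mu) / sigma) ** 2
            - np.log(sigma)
            - 0.5 * np.log(2 * np.pi)
            - np.log(1 - a**2 + 1e-6)
        )
        return a, log_prob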

entropy_alpha_auto_scale: bool = True

Whether to automatically tune the entropy alpha

entropy_alpha: float = 0.2

Initial value of the entropy alpha
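When auto-scaling is enabled, standard SAC learns alpha by minimizing J(alpha) = E[-alpha * (log pi(a|s) + H_target)] against a target entropy. A generic NumPy sketch of that objective (not necessarily the library's exact update):

    import numpy as np

    def alpha_loss(log_alpha, log_probs, target_entropy):
        # Standard SAC temperature objective: pushes the policy's
        # entropy toward target_entropy by adjusting alpha
        return float(np.mean(-np.exp(log_alpha) * (log_probs + target_entropy)))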

entropy_bonus_exclude_q: bool = False

Whether to exclude the entropy bonus from the Q-value calculation

enable_stable_gradients: bool = True

Countermeasure against exploding gradients: clips the mean, the variance, and sampled random actions so they do not take extreme values

stable_gradients_scale_range: tuple = (1e-10, 10)

Clipping range applied to the standard deviation when enable_stable_gradients is enabled
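In effect, the policy's standard deviation is kept inside this range before sampling; a one-line NumPy illustration (not the library's internal code):

    import numpy as np

    def clip_sigma(sigma_raw, scale_range=(1e-10, 10)):
        # Keep the standard deviation inside stable_gradients_scale_range
        return np.clip(sigma_raw, scale_range[0], scale_range[1])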