PPO (Proximal Policy Optimization)

class srl.algorithms.ppo.Config(observation_mode: Literal['', 'render_image'] = '', override_env_observation_type: srl.base.define.SpaceTypes = <SpaceTypes.UNKNOWN: 0>, override_observation_type: Union[str, srl.base.define.RLBaseTypes] = <RLBaseTypes.NONE: 1>, override_action_type: Union[str, srl.base.define.RLBaseTypes] = <RLBaseTypes.NONE: 1>, action_division_num: int = 10, observation_division_num: int = 1000, frameskip: int = 0, extend_worker: Optional[Type[ForwardRef('ExtendWorker')]] = None, processors: List[ForwardRef('RLProcessor')] = <factory>, render_image_processors: List[ForwardRef('RLProcessor')] = <factory>, enable_rl_processors: bool = True, enable_state_encode: bool = True, enable_action_decode: bool = True, window_length: int = 1, render_image_window_length: int = 1, render_last_step: bool = True, render_rl_image: bool = True, render_rl_image_size: Tuple[int, int] = (128, 128), enable_sanitize: bool = True, enable_assertion: bool = False, dtype: str = 'float32', batch_size: int = 32, memory: srl.rl.memories.replay_buffer.ReplayBufferConfig = <factory>, input_value_block: srl.rl.models.config.input_value_block.InputValueBlockConfig = <factory>, input_image_block: srl.rl.models.config.input_image_block.InputImageBlockConfig = <factory>, experience_collection_method: str = 'GAE', discount: float = 0.9, gae_discount: float = 0.9, baseline_type: str = 'advantage', surrogate_type: str = 'clip', policy_clip_range: float = 0.2, adaptive_kl_target: float = 0.01, enable_value_clip: float = True, value_clip_range: float = 0.2, lr: float = 0.02, lr_scheduler: srl.rl.schedulers.lr_scheduler.LRSchedulerConfig = <factory>, value_loss_weight: float = 1.0, entropy_weight: float = 0.1, enable_state_normalized: bool = False, global_gradient_clip_norm: float = 0.5, state_clip: Optional[Tuple[float, float]] = None, reward_clip: Optional[Tuple[float, float]] = None, enable_stable_gradients: bool = True, stable_gradients_scale_range: tuple = (1e-10, 10))
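
A minimal usage sketch (assuming the library's srl.Runner API and the bundled "Grid" sample environment; adjust to your setup). Only fields documented in this section are overridden:

    import srl
    from srl.algorithms import ppo

    # Build the PPO configuration and override a few of the fields
    # documented below; everything else keeps its default.
    rl_config = ppo.Config(
        batch_size=32,
        experience_collection_method="GAE",
        discount=0.9,
        surrogate_type="clip",
        policy_clip_range=0.2,
        lr=0.02,
    )

    # Assumed runner pattern: train briefly, then evaluate.
    runner = srl.Runner("Grid", rl_config)
    runner.train(timeout=10)
    print(runner.evaluate())
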
batch_size: int = 32

Batch size

memory: ReplayBufferConfig

<ReplayBuffer>

input_value_block: InputValueBlockConfig

<InputValueBlock>

input_image_block: InputImageBlockConfig

<InputImageBlock>

hidden_block: MLPBlockConfig

<MLPBlock> hidden layers

value_block: MLPBlockConfig

<MLPBlock> value layers

policy_block: MLPBlockConfig

<MLPBlock> policy layers

experience_collection_method: str = 'GAE'

Method used to compute the discounted reward targets (see the sketch after this list)

Parameters:
  • "MC" -- Monte Carlo returns

  • "GAE" -- Generalized Advantage Estimator

discount: float = 0.9

Discount factor

gae_discount: float = 0.9

Discount rate used inside GAE

baseline_type: str = 'advantage'

Baseline applied to the collected target (see the sketch after this list)

Parameters:
  • "none" ("") -- no baseline

  • "ave" -- (adv - mean)

  • "std" -- adv/std

  • "normal" -- (adv - mean)/std

  • "v" ("advantage") -- adv - v

surrogate_type: str = 'clip'

Surrogate objective type

Parameters:
  • "" -- none

  • "clip" -- Clipped Surrogate Objective

  • "kl" -- Adaptive KLペナルティ

policy_clip_range: float = 0.2

Clip range for the Clipped Surrogate Objective
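
A numpy sketch of the Clipped Surrogate Objective with policy_clip_range as the clip width (illustrative, not the library's code):

    import numpy as np

    def clipped_surrogate_loss(new_logpi, old_logpi, adv, policy_clip_range=0.2):
        ratio = np.exp(new_logpi - old_logpi)  # pi_new(a|s) / pi_old(a|s)
        clipped = np.clip(ratio, 1.0 - policy_clip_range, 1.0 + policy_clip_range)
        # Maximize min(ratio * A, clipped * A), i.e. minimize the negative.
        return -np.mean(np.minimum(ratio * adv, clipped * adv))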

adaptive_kl_target: float = 0.01

Target constant used inside the adaptive KL penalty
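
A sketch of the "kl" surrogate variant: the policy loss carries a KL penalty whose coefficient beta is adapted so the measured KL tracks adaptive_kl_target (the update rule below follows the original PPO paper; illustrative only):

    import numpy as np

    def kl_penalty_loss(new_logpi, old_logpi, adv, kl, beta):
        ratio = np.exp(new_logpi - old_logpi)
        return -np.mean(ratio * adv) + beta * kl

    def update_beta(beta, kl, adaptive_kl_target=0.01):
        # Grow/shrink the penalty so the measured KL stays near the target.
        if kl < adaptive_kl_target / 1.5:
            beta /= 2.0
        elif kl > adaptive_kl_target * 1.5:
            beta *= 2.0
        return beta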

enable_value_clip: bool = True

Whether to enable value clipping

value_clip_range: float = 0.2

Clip range used for value clipping
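
A numpy sketch of PPO-style value clipping (illustrative): value_clip_range bounds how far the new value prediction may move from the prediction made when the batch was collected:

    import numpy as np

    def clipped_value_loss(v_new, v_old, returns, value_clip_range=0.2):
        v_clipped = v_old + np.clip(v_new - v_old, -value_clip_range, value_clip_range)
        loss_unclipped = (v_new - returns) ** 2
        loss_clipped = (v_clipped - returns) ** 2
        return np.mean(np.maximum(loss_unclipped, loss_clipped))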

lr: float = 0.02

Learning rate

lr_scheduler: LRSchedulerConfig

<LRScheduler>

value_loss_weight: float = 1.0

Weight applied to the state-value (critic) loss

entropy_weight: float = 0.1

Weight applied to the entropy bonus
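
A sketch of how these two weights typically enter the total PPO loss (illustrative only):

    def total_loss(policy_loss, value_loss, entropy,
                   value_loss_weight=1.0, entropy_weight=0.1):
        # Entropy is subtracted: higher entropy (more exploration) lowers the loss.
        return policy_loss + value_loss_weight * value_loss - entropy_weight * entropy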

enable_state_normalized: bool = False

Whether to normalize states

global_gradient_clip_norm: float = 0.5

Clip value for the global L2 norm of the gradients (0 disables it)
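
A sketch of global-norm clipping using TensorFlow's tf.clip_by_global_norm (shown only to illustrate the parameter; the library also has a Torch backend):

    import tensorflow as tf

    def clip_grads(grads, global_gradient_clip_norm=0.5):
        if global_gradient_clip_norm == 0:
            return grads  # 0 disables clipping
        clipped, _ = tf.clip_by_global_norm(grads, global_gradient_clip_norm)
        return clipped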

state_clip: Tuple[float, float] | None = None

State clipping (None disables it; specify as a tuple such as (-10, 10))

reward_clip: Tuple[float, float] | None = None

Reward clipping (None disables it; specify as a tuple such as (-10, 10))
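
A sketch of how both clip options behave (illustrative):

    import numpy as np

    def maybe_clip(x, clip_range):
        # clip_range is None (disabled) or a (low, high) tuple such as (-10, 10).
        if clip_range is None:
            return x
        return np.clip(x, clip_range[0], clip_range[1])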

enable_stable_gradients: bool = True

Countermeasure against exploding gradients: the mean, the variance, and randomly sampled actions are clipped so they do not take extreme values

stable_gradients_scale_range: tuple = (1e-10, 10)

Clip range for the standard deviation when enable_stable_gradients is enabled
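
A sketch of the idea behind these two fields (illustrative): the standard deviation of the Gaussian policy is clipped into stable_gradients_scale_range so that sampling and log-probabilities cannot produce extreme values:

    import numpy as np

    def stabilize_stddev(stddev, scale_range=(1e-10, 10)):
        # Keep the policy's standard deviation inside a safe range to avoid
        # exploding gradients from extreme means, variances, or sampled actions.
        return np.clip(stddev, scale_range[0], scale_range[1])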