MuZero

class srl.algorithms.muzero.Config(observation_mode: Literal['', 'render_image'] = '', override_env_observation_type: srl.base.define.SpaceTypes = <SpaceTypes.UNKNOWN: 0>, override_observation_type: Union[str, srl.base.define.RLBaseTypes] = <RLBaseTypes.NONE: 1>, override_action_type: Union[str, srl.base.define.RLBaseTypes] = <RLBaseTypes.NONE: 1>, action_division_num: int = 10, observation_division_num: int = 1000, frameskip: int = 0, extend_worker: Optional[Type[ForwardRef('ExtendWorker')]] = None, processors: List[ForwardRef('RLProcessor')] = <factory>, render_image_processors: List[ForwardRef('RLProcessor')] = <factory>, enable_rl_processors: bool = True, enable_state_encode: bool = True, enable_action_decode: bool = True, window_length: int = 1, render_image_window_length: int = 1, render_last_step: bool = True, render_rl_image: bool = True, render_rl_image_size: Tuple[int, int] = (128, 128), enable_sanitize: bool = True, enable_assertion: bool = False, dtype: str = 'float32', num_simulations: int = 50, discount: float = 0.99, batch_size: int = 32, memory: srl.rl.memories.priority_replay_buffer.PriorityReplayBufferConfig = <factory>, input_image_block: srl.rl.models.config.input_image_block.InputImageBlockConfig = <factory>, lr: float = 0.001, lr_scheduler: srl.rl.schedulers.lr_scheduler.LRSchedulerConfig = <factory>, reward_range: tuple = (-10, 10), reward_range_num: int = 100, value_range: tuple = (-10, 10), value_range_num: int = 100, test_policy_tau: float = 0.1, policy_tau: Optional[float] = None, policy_tau_scheduler: srl.rl.schedulers.scheduler.SchedulerConfig = <factory>, unroll_steps: int = 3, root_dirichlet_alpha: float = 0.3, root_exploration_fraction: float = 0.25, c_base: float = 19652, c_init: float = 1.25, dynamics_blocks: int = 15, reward_dense_units: int = 0, weight_decay: float = 0.0001, enable_rescale: bool = False, enable_reanalyze: bool = False)
num_simulations: int = 50

MCTSのシミュレーション回数

discount: float = 0.99

割引率

batch_size: int = 32

Batch size

memory: PriorityReplayBufferConfig

<PriorityReplayBuffer>

input_image_block: InputImageBlockConfig

<InputImageBlock>

lr: float = 0.001

Learning rate

lr_scheduler: LRSchedulerConfig

<LRScheduler>

reward_range: tuple = (-10, 10)

報酬(reward)をカテゴリ化する範囲

value_range: tuple = (-10, 10)

価値(value)をカテゴリ化する範囲

policy_tau_scheduler: SchedulerConfig

<Scheduler>

unroll_steps: int = 3

Number of steps to unroll the dynamics model during training.

root_dirichlet_alpha: float = 0.3

Alpha parameter of the Dirichlet exploration noise added to the root prior.

root_exploration_fraction: float = 0.25

Fraction of the Dirichlet exploration noise mixed into the root prior.

c_base: float = 19652

c_base constant of the PUCT formula

c_init: float = 1.25

c_init constant of the PUCT formula

dynamics_blocks: int = 15

Dynamics networkのブロック数

reward_dense_units: int = 0

reward dense units

weight_decay: float = 0.0001

weight decay

enable_rescale: bool = False

Whether to enable rescaling

enable_reanalyze: bool = False

Whether to enable reanalyze