---
# SAC training configuration for the fast simulator.
# NOTE(review): indentation was lost in the original file (all keys flat at
# column 0, leaving section keys null). Nesting reconstructed from key names
# and section order — confirm against the config loader's expected schema.
experiment:
  name: sac_fastsim
  runs_dir: outputs/runs
  seed: 42
  total_timesteps: 10000000

env:
  env_kwargs:
    params:
      dt: 0.1
      lidar_num_beams: 90  # fewer beams = faster CPU stepping
      lidar_range_max: 30.0
      v_max: 6.0
      wz_max: 2.5
      vz_max: 2.0
      r_safe: 0.22
      episode_seconds: 60.0
      goal_tolerance: 0.5
      world_radius: 20.0
      boundary_margin: 1.5
      collision_threshold: 0.18
      default_z_target: 2.0
      z_error_scale: 5.0
      reward_progress_scale: 5.0
      reward_speed_scale: 0.02
      reward_step_penalty: 0.02
      reward_proximity_scale: 0.35
      reward_shield_penalty: 0.02
      reward_collision_penalty: 20.0
      reward_success_bonus: 20.0
      reward_truncation_penalty: 8.0
      reward_yaw_rate_scale: 0.03
      reward_stall_penalty: 0.05
      reward_accel_clip_penalty: 0.05
      progress_stall_threshold: 0.02
      yaw_penalty_speed_gate: 0.25
      shield_yaw_damping: 0.35
      shield_lookahead_margin: 2.5
      shield_floor_z_min: 0.05
      worldgen_config_relpath: configs/worldgen/worldgen_run.yaml
      worldgen_seed_offset: 0
      worldgen_resample_every_n_episodes: 1  # increase to 2-4 for faster training resets
      worldgen_verbose: false  # keep false during training to reduce I/O
      tree_radius_mean: 0.14
      tree_radius_std: 0.03
      tree_radius_min: 0.08
      tree_radius_max: 0.30
      start_goal_clearance: 0.6
      min_start_goal_distance: 8.0
      spawn_max_attempts: 500
      # dynamics control mode: "hybrid" applies acceleration limits before integration
      # so the policy must plan a smooth velocity trajectory (better sim-to-real transfer).
      action_mode: hybrid
      accel_v_max: 4.5  # m/s**2: easier early learning under hybrid dynamics
      accel_wz_max: 3.0  # rad/s**2
      accel_vz_max: 2.0  # m/s**2
      decel_v_max: 9.0  # m/s**2: stronger braking than acceleration
      decel_wz_max: 4.5  # rad/s**2
      decel_vz_max: 3.0  # m/s**2

training:
  n_envs: 16
  save_checkpoints: false
  checkpoint_freq_step: 100000
  final_snapshot_freq_step: 200000
  save_replay_buffer: false
  norm:
    enabled: true
    norm_obs: true
    norm_reward: true
    clip_obs: 10.0
    clip_reward: 10.0
  logging:
    eval_freq_step: 50000
    n_eval_episodes: 5
    metrics_log_freq_step: 1000
    command_center_episode_window: 200

sac:
  policy: MlpPolicy
  learning_rate: 0.0003
  buffer_size: 1000000
  learning_starts: 10000
  batch_size: 1024  # larger batches for better GPU utilization
  tau: 0.005
  gamma: 0.99
  train_freq: 4  # collect 4 env steps before each update phase
  gradient_steps: 8  # 8 gradient steps per 4-step collection (2 updates per env step)
  ent_coef: auto
  target_update_interval: 1
  verbose: 1
  policy_kwargs:
    net_arch: [512, 512]  # larger network for more GPU work

# NOTE(review): original placement ambiguous (flat file) — confirm whether this
# belongs at top level or under `training`.
save_vecnorm: true