# forest_navigating_uav/configs/training/sac_hybrid_stage1.yaml
---
# Run bookkeeping: experiment identity, output location, RNG seed, and the
# total environment-step budget for this training stage.
experiment:
  name: sac_fastsim_hybrid_stage1
  runs_dir: outputs/runs  # presumably relative to the repo root — TODO confirm
  seed: 42
  # Plain digits: `3_000_000` relies on the YAML 1.1 underscore extension and
  # is read as the string "3_000_000" by strict YAML 1.2 parsers.
  total_timesteps: 3000000

# Simulator environment configuration, forwarded as keyword arguments to the
# env constructor. Semantics of individual params are inferred from their
# names — TODO confirm against the environment class that consumes them.
env:
  env_kwargs:
    params:
      # -- core simulation and kinematic limits --
      dt: 0.1  # integration step; presumably seconds
      lidar_num_beams: 90
      lidar_range_max: 30.0  # presumably metres
      v_max: 6.0  # max forward speed limit
      wz_max: 2.5  # max yaw rate limit
      vz_max: 2.0  # max vertical speed limit
      r_safe: 0.22  # safety radius around the vehicle — NOTE(review): verify units
      episode_seconds: 60.0  # episode time limit
      goal_tolerance: 0.5  # distance at which the goal counts as reached
      world_radius: 20.0
      boundary_margin: 1.5
      collision_threshold: 0.18  # smaller than r_safe — presumably hard-collision distance
      default_z_target: 2.0
      z_error_scale: 5.0

      # -- reward shaping coefficients --
      reward_progress_scale: 5.0
      reward_speed_scale: 0.02
      reward_step_penalty: 0.02
      reward_proximity_scale: 0.25
      reward_shield_penalty: 0.02
      reward_collision_penalty: 15.0
      reward_success_bonus: 20.0
      reward_truncation_penalty: 8.0
      reward_yaw_rate_scale: 0.03
      reward_stall_penalty: 0.05
      reward_accel_clip_penalty: 0.03

      # -- reward gating and safety-shield tuning --
      progress_stall_threshold: 0.02
      yaw_penalty_speed_gate: 0.25
      shield_yaw_damping: 0.35
      shield_lookahead_margin: 2.0
      shield_floor_z_min: 0.05

      # -- procedural world generation and spawn sampling --
      worldgen_config_relpath: configs/worldgen/worldgen_run.yaml
      worldgen_seed_offset: 0
      tree_radius_mean: 0.14
      tree_radius_std: 0.03
      tree_radius_min: 0.08
      tree_radius_max: 0.30
      start_goal_clearance: 0.6
      min_start_goal_distance: 8.0
      spawn_max_attempts: 500  # give up on start/goal sampling after this many tries

      # -- action space: hybrid mode with separate accel/decel limits --
      action_mode: hybrid
      accel_v_max: 5.0
      accel_wz_max: 3.5
      accel_vz_max: 2.5
      decel_v_max: 10.0
      decel_wz_max: 5.0
      decel_vz_max: 4.0

# Vectorised-training and checkpointing options.
training:
  n_envs: 16  # parallel environment copies
  save_checkpoints: false
  # Plain digits: `100_000` / `200_000` rely on the YAML 1.1 underscore
  # extension and parse as strings under strict YAML 1.2 parsers.
  checkpoint_freq_step: 100000
  final_snapshot_freq_step: 200000
  save_replay_buffer: false
  # Observation/reward normalisation — presumably VecNormalize-style; TODO
  # confirm which wrapper consumes this stanza.
  norm:
    enabled: true
    norm_obs: true
    norm_reward: true
    clip_obs: 10.0
    clip_reward: 10.0

# Evaluation cadence during training.
logging:
  # Plain digits: `50_000` is a YAML 1.1-only literal form; strict 1.2
  # parsers would read it as a string.
  eval_freq_step: 50000
  n_eval_episodes: 5

# SAC algorithm hyperparameters — presumably forwarded to the
# Stable-Baselines3 SAC constructor; TODO confirm in the training script.
sac:
  policy: MlpPolicy
  learning_rate: 0.0003
  # Plain digits: `1_000_000` / `10_000` rely on the YAML 1.1 underscore
  # extension and parse as strings under strict YAML 1.2 parsers.
  buffer_size: 1000000
  learning_starts: 10000
  batch_size: 1024
  tau: 0.005  # target-network soft-update coefficient
  gamma: 0.99  # discount factor
  train_freq: 4
  gradient_steps: 8
  # Quoted so no parser dialect can re-type it; the consumer expects the
  # literal string 'auto' (identical parsed value either way).
  ent_coef: 'auto'
  target_update_interval: 1
  verbose: 1
  policy_kwargs:
    net_arch: [512, 512]  # two hidden layers of 512 units

# Persist normalisation statistics alongside the model — NOTE(review): this
# is a top-level key outside `training.norm`; confirm the loader reads it here.
save_vecnorm: true