# balance_ppo.yaml
---
experiment:
  name: balance_ppo
robot:
  model_name: guguji
  urdf_path: guguji_ros2_ws/src/guguji_ros2/urdf/guguji.urdf
  joint_names:
    - left_hip_pitch_joint
    - left_knee_pitch_joint
    - left_ankle_pitch_joint
    - left_ankle_joint
    - right_hip_pitch_joint
    - right_knee_pitch_joint
    - right_ankle_pitch_joint
    - right_ankle_joint
  command_topic_prefix: /guguji/command
  # 第一版先围绕一个安全站姿做小范围残差控制,避免随机策略一步把机器人打翻。
  nominal_joint_positions:
    left_hip_pitch_joint: 0.0
    left_knee_pitch_joint: 0.0
    left_ankle_pitch_joint: 0.0
    left_ankle_joint: 0.0
    right_hip_pitch_joint: 0.0
    right_knee_pitch_joint: 0.0
    right_ankle_pitch_joint: 0.0
    right_ankle_joint: 0.0
  action_scale: 0.12
  action_smoothing: 0.85
ros:
  joint_state_topic: /joint_states
  tf_topic: /tf
  clock_topic: /clock
  world_control_service: /world/default/control
sim:
  world_name: default
  # 强化学习训练更适合用 service_step,能让每一步更可控、更容易复现。
  step_mode: service_step
  control_dt: 0.05
  # 每次动作只推进较少的物理步,降低初期随机动作的破坏性。
  service_step_iterations: 15
  reset_settle_seconds: 1.0
  # reset 后先保持几步名义站姿,让机器人回到更稳定的初始状态。
  reset_hold_steps: 8
  action_publish_delay: 0.01
  post_step_wait_seconds: 0.01
  # 这里保留 GUI 版启动提示,方便你直观看训练时机器人在做什么。
  launch_hint: 'ros2 launch guguji_ros2 gazebo.launch.py pause:=true'
task:
  target_forward_velocity: 0.0
  target_base_height: null
  # 第一轮先放宽跌倒判定,避免轻微摆动就立刻结束 episode。
  max_roll_rad: 1.00
  max_pitch_rad: 1.00
  min_base_height: 0.24
  termination_grace_steps: 8
rewards:
  alive_bonus: 2.0
  velocity_tracking_scale: 1.5
  velocity_tracking_sigma: 0.30
  upright_scale: 2.0
  height_scale: 1.0
  action_rate_penalty_scale: 0.01
  joint_limit_penalty_scale: 0.05
  lateral_velocity_penalty_scale: 0.10
  fall_penalty: -8.0
training:
  algorithm: ppo
  total_timesteps: 200000
  max_episode_steps: 400
  seed: 42
  device: auto
  learning_rate: 0.0003
  # 单 Gazebo 环境训练很慢,这里先把 rollout 变短,便于更快看到日志反馈。
  n_steps: 128
  batch_size: 64
  gamma: 0.99
  gae_lambda: 0.95
  clip_range: 0.2
  ent_coef: 0.0
  vf_coef: 0.5
  policy_net_arch: [256, 256]
  checkpoint_freq: 20000
  output_root: guguji_rl/outputs
evaluation:
  episodes: 3
  deterministic: true