diff --git a/bot/src/dqn/burnrl/dqn_model.rs b/bot/src/dqn/burnrl/dqn_model.rs
index 9cf72a1..7e1c797 100644
--- a/bot/src/dqn/burnrl/dqn_model.rs
+++ b/bot/src/dqn/burnrl/dqn_model.rs
@@ -164,6 +164,7 @@ pub fn run, B: AutodiffBackend>(
         let mut episode_duration = 0_usize;
         let mut state = env.state();
         let mut now = SystemTime::now();
+        let mut goodmoves_ratio = 0.0;
 
         while !episode_done {
             let eps_threshold = conf.eps_end
@@ -192,13 +193,17 @@ pub fn run, B: AutodiffBackend>(
             episode_duration += 1;
 
             if snapshot.done() || episode_duration >= conf.max_steps {
-                env.reset();
-                episode_done = true;
-
+                let envmut = env.as_mut();
                 println!(
-                    "{{\"episode\": {episode}, \"reward\": {episode_reward:.4}, \"steps count\": {episode_duration}, \"threshold\": {eps_threshold:.3}, \"duration\": {}}}",
+                    "{{\"episode\": {episode}, \"reward\": {episode_reward:.4}, \"steps count\": {episode_duration}, \"epsilon\": {eps_threshold:.3}, \"goodmoves\": {}, \"gm%\": {:.1}, \"rollpoints\":{}, \"duration\": {}}}",
+                    envmut.goodmoves_count,
+                    goodmoves_ratio * 100.0,
+                    envmut.pointrolls_count,
                     now.elapsed().unwrap().as_secs(),
                 );
+                goodmoves_ratio = envmut.goodmoves_ratio;
+                env.reset();
+                episode_done = true;
                 now = SystemTime::now();
             } else {
                 state = *snapshot.state();
diff --git a/bot/src/dqn/burnrl/environment.rs b/bot/src/dqn/burnrl/environment.rs
index 5cc37c4..a774b12 100644
--- a/bot/src/dqn/burnrl/environment.rs
+++ b/bot/src/dqn/burnrl/environment.rs
@@ -86,6 +86,7 @@ pub struct TrictracEnvironment {
     pub step_count: usize,
     pub min_steps: f32,
     pub max_steps: usize,
+    pub pointrolls_count: usize,
     pub goodmoves_count: usize,
     pub goodmoves_ratio: f32,
     pub visualized: bool,
@@ -118,6 +119,7 @@ impl Environment for TrictracEnvironment {
             step_count: 0,
             min_steps: 250.0,
             max_steps: 2000,
+            pointrolls_count: 0,
             goodmoves_count: 0,
             goodmoves_ratio: 0.0,
             visualized,
@@ -150,6 +152,7 @@ impl Environment for TrictracEnvironment {
             (100.0 * self.goodmoves_ratio).round() as u32
         );
         self.step_count = 0;
+        self.pointrolls_count = 0;
         self.goodmoves_count = 0;
 
         Snapshot::new(self.current_state, 0.0, false)
@@ -162,12 +165,16 @@
         let trictrac_action = Self::convert_action(action);
         let mut reward = 0.0;
+        let mut is_rollpoint = false;
         let mut terminated = false;
 
         // Exécuter l'action si c'est le tour de l'agent DQN
         if self.game.active_player_id == self.active_player_id {
             if let Some(action) = trictrac_action {
-                reward = self.execute_action(action);
+                (reward, is_rollpoint) = self.execute_action(action);
+                if is_rollpoint {
+                    self.pointrolls_count += 1;
+                }
                 if reward != Self::ERROR_REWARD {
                     self.goodmoves_count += 1;
                 }
@@ -249,10 +256,11 @@ impl TrictracEnvironment {
     //     &mut self,
     //     action: dqn_common::TrictracAction,
     // ) -> Result> {
-    fn execute_action(&mut self, action: dqn_common::TrictracAction) -> f32 {
+    fn execute_action(&mut self, action: dqn_common::TrictracAction) -> (f32, bool) {
         use dqn_common::TrictracAction;
 
         let mut reward = 0.0;
+        let mut is_rollpoint = false;
 
         let event = match action {
             TrictracAction::Roll => {
@@ -330,7 +338,8 @@
                 let (points, adv_points) = self.game.dice_points;
                 reward += Self::REWARD_RATIO * (points - adv_points) as f32;
                 if points > 0 {
-                    println!("info: rolled for {reward}");
+                    is_rollpoint = true;
+                    // println!("info: rolled for {reward}");
                 }
                 // Récompense proportionnelle aux points
             }
@@ -343,7 +352,7 @@
             }
         }
 
-        reward
+        (reward, is_rollpoint)
     }
 
     /// Fait jouer l'adversaire avec une stratégie simple
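The environment change above threads a new flag through execute_action: it now returns a (reward, is_rollpoint) tuple, and step uses that flag to bump pointrolls_count while still counting good moves via the ERROR_REWARD check. The standalone sketch below (not part of the patch) shows only that counter pattern; the CounterEnv struct, the action ids, the reward values and the place where goodmoves_ratio is recomputed are placeholders, since the diff does not show how the real TrictracEnvironment derives the ratio.

// Standalone sketch of the (reward, is_rollpoint) counter pattern.
// All names and values below are illustrative placeholders.
const ERROR_REWARD: f32 = -1.0;

#[derive(Default)]
struct CounterEnv {
    step_count: usize,
    pointrolls_count: usize,
    goodmoves_count: usize,
    goodmoves_ratio: f32,
}

impl CounterEnv {
    /// Placeholder for execute_action: returns the reward plus a flag saying
    /// whether this step scored points on a dice roll.
    fn execute_action(&mut self, action_id: u32) -> (f32, bool) {
        match action_id {
            0 => (0.5, true),           // a roll that scored points
            1 => (0.1, false),          // a valid move, no points
            _ => (ERROR_REWARD, false), // invalid action
        }
    }

    /// Step wrapper that updates the counters the same way the diff does.
    /// Where goodmoves_ratio is really updated is an assumption here.
    fn step(&mut self, action_id: u32) -> f32 {
        let (reward, is_rollpoint) = self.execute_action(action_id);
        if is_rollpoint {
            self.pointrolls_count += 1;
        }
        if reward != ERROR_REWARD {
            self.goodmoves_count += 1;
        }
        self.step_count += 1;
        self.goodmoves_ratio = self.goodmoves_count as f32 / self.step_count as f32;
        reward
    }
}

fn main() {
    let mut env = CounterEnv::default();
    for action_id in [0, 1, 2, 1] {
        env.step(action_id);
    }
    println!(
        "goodmoves: {}, gm%: {:.1}, rollpoints: {}",
        env.goodmoves_count,
        env.goodmoves_ratio * 100.0,
        env.pointrolls_count
    );
}

Returning the flag alongside the reward keeps point-roll detection next to the reward computation instead of trying to re-derive it from the reward value inside step.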
diff --git a/bot/src/dqn/burnrl/main.rs b/bot/src/dqn/burnrl/main.rs
index d6162df..d8b200f 100644
--- a/bot/src/dqn/burnrl/main.rs
+++ b/bot/src/dqn/burnrl/main.rs
@@ -14,24 +14,25 @@ fn main() {
 
     // See also MEMORY_SIZE in dqn_model.rs : 8192
     let conf = dqn_model::DqnConfig {
-        num_episodes: 40,
-        min_steps: 250.0, // min steps by episode (mise à jour par la fonction)
-        max_steps: 2000, // max steps by episode
-        dense_size: 256, // neural network complexity
-        eps_start: 0.9, // epsilon initial value (0.9 => more exploration)
-        eps_end: 0.05,
+        // defaults
+        num_episodes: 40, // 40
+        min_steps: 500.0, // 1000 min of max steps by episode (mise à jour par la fonction)
+        max_steps: 3000, // 1000 max steps by episode
+        dense_size: 256, // 128 neural network complexity (default 128)
+        eps_start: 0.9, // 0.9 epsilon initial value (0.9 => more exploration)
+        eps_end: 0.05, // 0.05
         // eps_decay higher = epsilon decrease slower
         // used in : epsilon = eps_end + (eps_start - eps_end) * e^(-step / eps_decay);
         // epsilon is updated at the start of each episode
-        eps_decay: 3000.0,
+        eps_decay: 2000.0, // 1000 ?
 
-        gamma: 0.999, // discount factor. Plus élevé = encourage stratégies à long terme
-        tau: 0.005, // soft update rate. Taux de mise à jour du réseau cible. Plus bas = adaptation
+        gamma: 0.999, // 0.999 discount factor. Plus élevé = encourage stratégies à long terme
+        tau: 0.005, // 0.005 soft update rate. Taux de mise à jour du réseau cible. Plus bas = adaptation
         // plus lente moins sensible aux coups de chance
-        learning_rate: 0.001, // taille du pas. Bas : plus lent, haut : risque de ne jamais
+        learning_rate: 0.001, // 0.001 taille du pas. Bas : plus lent, haut : risque de ne jamais
         // converger
-        batch_size: 32, // nombre d'expériences passées sur lesquelles pour calcul de l'erreur moy.
-        clip_grad: 100.0, // plafonnement du gradient : limite max de correction à apporter
+        batch_size: 32, // 32 nombre d'expériences passées sur lesquelles pour calcul de l'erreur moy.
+        clip_grad: 100.0, // 100 limite max de correction à apporter au gradient (default 100)
     };
     println!("{conf}----------");
     let agent = dqn_model::run::(&conf, false); //true);
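The eps_decay change (3000.0 to 2000.0) is easier to judge against the formula already quoted in the config comment, epsilon = eps_end + (eps_start - eps_end) * e^(-step / eps_decay), with the eps_start/eps_end values above. The quick standalone sketch below (not part of the patch) prints the schedule for both decay constants over a range of step counts; what exactly the step counter measures is left open, since the comment only says epsilon is updated at the start of each episode.

/// Epsilon-greedy threshold as given by the formula quoted in the config comment.
fn epsilon(step: f64, eps_start: f64, eps_end: f64, eps_decay: f64) -> f64 {
    eps_end + (eps_start - eps_end) * (-step / eps_decay).exp()
}

fn main() {
    let (eps_start, eps_end) = (0.9, 0.05);
    // Compare the new eps_decay (2000.0) with the previous value (3000.0).
    for step in [0.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0] {
        println!(
            "step {:>5}: eps_decay=2000 -> {:.3}, eps_decay=3000 -> {:.3}",
            step,
            epsilon(step, eps_start, eps_end, 2000.0),
            epsilon(step, eps_start, eps_end, 3000.0),
        );
    }
}

Lowering eps_decay makes the exponential term shrink sooner, so exploration falls off earlier for the same number of epsilon updates.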