refacto: burnrl

2025-08-20 13:09:57 +02:00 · 2025-08-20 13:09:57 +02:00 · 18e85744d6
commit 18e85744d6
parent 97167ff389
27 changed files with 387 additions and 1092 deletions
--- a/bot/Cargo.toml
+++ b/bot/Cargo.toml
@ -5,6 +5,10 @@ edition = "2021"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

+[[bin]]
+name = "burn_demo"
+path = "src/burnrl/main.rs"
+
 [[bin]]
 name = "train_dqn_burn_valid"
 path = "src/burnrl/dqn_valid/main.rs"
--- a/bot/src/burnrl/dqn/main.rs
+++ b/bot/src/burnrl/dqn/main.rs
@ -1,54 +0,0 @@
-use bot::burnrl::dqn::{
-    dqn_model,
-    utils::{demo_model, load_model, save_model},
-};
-use bot::burnrl::environment;
-use burn::backend::{Autodiff, NdArray};
-use burn_rl::agent::DQN;
-use burn_rl::base::ElemType;
-
-type Backend = Autodiff<NdArray<ElemType>>;
-type Env = environment::TrictracEnvironment;
-
-fn main() {
-    // println!("> Entraînement");
-
-    // See also MEMORY_SIZE in dqn_model.rs : 8192
-    let conf = dqn_model::DqnConfig {
-        //                   defaults
-        num_episodes: 50,  // 40
-        min_steps: 1000.0, // 1000 min of max steps by episode (mise à jour par la fonction)
-        max_steps: 1000,   // 1000 max steps by episode
-        dense_size: 256,   // 128  neural network complexity (default 128)
-        eps_start: 0.9,    // 0.9  epsilon initial value (0.9 => more exploration)
-        eps_end: 0.05,     // 0.05
-        // eps_decay higher = epsilon decrease slower
-        // used in : epsilon = eps_end + (eps_start - eps_end) * e^(-step / eps_decay);
-        // epsilon is updated at the start of each episode
-        eps_decay: 2000.0, // 1000 ?
-
-        gamma: 0.9999, // 0.999 discount factor. Plus élevé = encourage stratégies à long terme
-        tau: 0.0005, // 0.005 soft update rate. Taux de mise à jour du réseau cible. Plus bas = adaptation
-        // plus lente moins sensible aux coups de chance
-        learning_rate: 0.001, // 0.001 taille du pas. Bas : plus lent, haut : risque de ne jamais
-        // converger
-        batch_size: 128, // 32 nombre d'expériences passées sur lesquelles pour calcul de l'erreur moy.
-        clip_grad: 70.0, // 100 limite max de correction à apporter au gradient (default 100)
-    };
-    println!("{conf}----------");
-    let agent = dqn_model::run::<Env, Backend>(&conf, false); //true);
-
-    let valid_agent = agent.valid();
-
-    println!("> Sauvegarde du modèle de validation");
-
-    let path = "bot/models/burnrl_dqn".to_string();
-    save_model(valid_agent.model().as_ref().unwrap(), &path);
-
-    println!("> Chargement du modèle pour test");
-    let loaded_model = load_model(conf.dense_size, &path);
-    let loaded_agent = DQN::new(loaded_model.unwrap());
-
-    println!("> Test avec le modèle chargé");
-    demo_model(loaded_agent);
-}
--- a/bot/src/burnrl/dqn/mod.rs
+++ b/bot/src/burnrl/dqn/mod.rs
@ -1,2 +0,0 @@
-pub mod dqn_model;
-pub mod utils;
--- a/bot/src/burnrl/dqn/utils.rs
+++ b/bot/src/burnrl/dqn/utils.rs
@ -1,112 +0,0 @@
-use crate::burnrl::dqn::dqn_model;
-use crate::burnrl::environment::{TrictracAction, TrictracEnvironment};
-use crate::training_common::get_valid_action_indices;
-use burn::backend::{ndarray::NdArrayDevice, NdArray};
-use burn::module::{Module, Param, ParamId};
-use burn::nn::Linear;
-use burn::record::{CompactRecorder, Recorder};
-use burn::tensor::backend::Backend;
-use burn::tensor::cast::ToElement;
-use burn::tensor::Tensor;
-use burn_rl::agent::{DQNModel, DQN};
-use burn_rl::base::{Action, ElemType, Environment, State};
-
-pub fn save_model(model: &dqn_model::Net<NdArray<ElemType>>, path: &String) {
-    let recorder = CompactRecorder::new();
-    let model_path = format!("{path}.mpk");
-    println!("Modèle de validation sauvegardé : {model_path}");
-    recorder
-        .record(model.clone().into_record(), model_path.into())
-        .unwrap();
-}
-
-pub fn load_model(dense_size: usize, path: &String) -> Option<dqn_model::Net<NdArray<ElemType>>> {
-    let model_path = format!("{path}.mpk");
-    // println!("Chargement du modèle depuis : {model_path}");
-
-    CompactRecorder::new()
-        .load(model_path.into(), &NdArrayDevice::default())
-        .map(|record| {
-            dqn_model::Net::new(
-                <TrictracEnvironment as Environment>::StateType::size(),
-                dense_size,
-                <TrictracEnvironment as Environment>::ActionType::size(),
-            )
-            .load_record(record)
-        })
-        .ok()
-}
-
-pub fn demo_model<B: Backend, M: DQNModel<B>>(agent: DQN<TrictracEnvironment, B, M>) {
-    let mut env = TrictracEnvironment::new(true);
-    let mut done = false;
-    while !done {
-        // let action = match infer_action(&agent, &env, state) {
-        let action = match infer_action(&agent, &env) {
-            Some(value) => value,
-            None => break,
-        };
-        // Execute action
-        let snapshot = env.step(action);
-        done = snapshot.done();
-    }
-}
-
-fn infer_action<B: Backend, M: DQNModel<B>>(
-    agent: &DQN<TrictracEnvironment, B, M>,
-    env: &TrictracEnvironment,
-) -> Option<TrictracAction> {
-    let state = env.state();
-    // Get q-values
-    let q_values = agent
-        .model()
-        .as_ref()
-        .unwrap()
-        .infer(state.to_tensor().unsqueeze());
-    // Get valid actions
-    let valid_actions_indices = get_valid_action_indices(&env.game);
-    if valid_actions_indices.is_empty() {
-        return None; // No valid actions, end of episode
-    }
-    // Set non valid actions q-values to lowest
-    let mut masked_q_values = q_values.clone();
-    let q_values_vec: Vec<f32> = q_values.into_data().into_vec().unwrap();
-    for (index, q_value) in q_values_vec.iter().enumerate() {
-        if !valid_actions_indices.contains(&index) {
-            masked_q_values = masked_q_values.clone().mask_fill(
-                masked_q_values.clone().equal_elem(*q_value),
-                f32::NEG_INFINITY,
-            );
-        }
-    }
-    // Get best action (highest q-value)
-    let action_index = masked_q_values.argmax(1).into_scalar().to_u32();
-    let action = TrictracAction::from(action_index);
-    Some(action)
-}
-
-fn soft_update_tensor<const N: usize, B: Backend>(
-    this: &Param<Tensor<B, N>>,
-    that: &Param<Tensor<B, N>>,
-    tau: ElemType,
-) -> Param<Tensor<B, N>> {
-    let that_weight = that.val();
-    let this_weight = this.val();
-    let new_weight = this_weight * (1.0 - tau) + that_weight * tau;
-
-    Param::initialized(ParamId::new(), new_weight)
-}
-
-pub fn soft_update_linear<B: Backend>(
-    this: Linear<B>,
-    that: &Linear<B>,
-    tau: ElemType,
-) -> Linear<B> {
-    let weight = soft_update_tensor(&this.weight, &that.weight, tau);
-    let bias = match (&this.bias, &that.bias) {
-        (Some(this_bias), Some(that_bias)) => Some(soft_update_tensor(this_bias, that_bias, tau)),
-        _ => None,
-    };
-
-    Linear::<B> { weight, bias }
-}
--- a/bot/src/burnrl/dqn_big/main.rs
+++ b/bot/src/burnrl/dqn_big/main.rs
@ -1,54 +0,0 @@
-use bot::burnrl::dqn_big::{
-    dqn_model,
-    utils::{demo_model, load_model, save_model},
-};
-use bot::burnrl::environment_big;
-use burn::backend::{Autodiff, NdArray};
-use burn_rl::agent::DQN;
-use burn_rl::base::ElemType;
-
-type Backend = Autodiff<NdArray<ElemType>>;
-type Env = environment_big::TrictracEnvironment;
-
-fn main() {
-    // println!("> Entraînement");
-
-    // See also MEMORY_SIZE in dqn_model.rs : 8192
-    let conf = dqn_model::DqnConfig {
-        //                   defaults
-        num_episodes: 40,  // 40
-        min_steps: 2000.0, // 1000 min of max steps by episode (mise à jour par la fonction)
-        max_steps: 4000,   // 1000 max steps by episode
-        dense_size: 128,   // 128  neural network complexity (default 128)
-        eps_start: 0.9,    // 0.9  epsilon initial value (0.9 => more exploration)
-        eps_end: 0.05,     // 0.05
-        // eps_decay higher = epsilon decrease slower
-        // used in : epsilon = eps_end + (eps_start - eps_end) * e^(-step / eps_decay);
-        // epsilon is updated at the start of each episode
-        eps_decay: 1000.0, // 1000 ?
-
-        gamma: 0.999, // 0.999 discount factor. Plus élevé = encourage stratégies à long terme
-        tau: 0.005, // 0.005 soft update rate. Taux de mise à jour du réseau cible. Plus bas = adaptation
-        // plus lente moins sensible aux coups de chance
-        learning_rate: 0.001, // 0.001 taille du pas. Bas : plus lent, haut : risque de ne jamais
-        // converger
-        batch_size: 32, // 32 nombre d'expériences passées sur lesquelles pour calcul de l'erreur moy.
-        clip_grad: 100.0, // 100 limite max de correction à apporter au gradient (default 100)
-    };
-    println!("{conf}----------");
-    let agent = dqn_model::run::<Env, Backend>(&conf, false); //true);
-
-    let valid_agent = agent.valid();
-
-    println!("> Sauvegarde du modèle de validation");
-
-    let path = "models/burn_dqn_40".to_string();
-    save_model(valid_agent.model().as_ref().unwrap(), &path);
-
-    println!("> Chargement du modèle pour test");
-    let loaded_model = load_model(conf.dense_size, &path);
-    let loaded_agent = DQN::new(loaded_model.unwrap());
-
-    println!("> Test avec le modèle chargé");
-    demo_model(loaded_agent);
-}
--- a/bot/src/burnrl/dqn_big/mod.rs
+++ b/bot/src/burnrl/dqn_big/mod.rs
@ -1,2 +0,0 @@
-pub mod dqn_model;
-pub mod utils;
--- a/bot/src/burnrl/dqn_big/utils.rs
+++ b/bot/src/burnrl/dqn_big/utils.rs
@ -1,112 +0,0 @@
-use crate::burnrl::dqn_big::dqn_model;
-use crate::burnrl::environment_big::{TrictracAction, TrictracEnvironment};
-use crate::training_common_big::get_valid_action_indices;
-use burn::backend::{ndarray::NdArrayDevice, NdArray};
-use burn::module::{Module, Param, ParamId};
-use burn::nn::Linear;
-use burn::record::{CompactRecorder, Recorder};
-use burn::tensor::backend::Backend;
-use burn::tensor::cast::ToElement;
-use burn::tensor::Tensor;
-use burn_rl::agent::{DQNModel, DQN};
-use burn_rl::base::{Action, ElemType, Environment, State};
-
-pub fn save_model(model: &dqn_model::Net<NdArray<ElemType>>, path: &String) {
-    let recorder = CompactRecorder::new();
-    let model_path = format!("{path}_model.mpk");
-    println!("Modèle de validation sauvegardé : {model_path}");
-    recorder
-        .record(model.clone().into_record(), model_path.into())
-        .unwrap();
-}
-
-pub fn load_model(dense_size: usize, path: &String) -> Option<dqn_model::Net<NdArray<ElemType>>> {
-    let model_path = format!("{path}_model.mpk");
-    // println!("Chargement du modèle depuis : {model_path}");
-
-    CompactRecorder::new()
-        .load(model_path.into(), &NdArrayDevice::default())
-        .map(|record| {
-            dqn_model::Net::new(
-                <TrictracEnvironment as Environment>::StateType::size(),
-                dense_size,
-                <TrictracEnvironment as Environment>::ActionType::size(),
-            )
-            .load_record(record)
-        })
-        .ok()
-}
-
-pub fn demo_model<B: Backend, M: DQNModel<B>>(agent: DQN<TrictracEnvironment, B, M>) {
-    let mut env = TrictracEnvironment::new(true);
-    let mut done = false;
-    while !done {
-        // let action = match infer_action(&agent, &env, state) {
-        let action = match infer_action(&agent, &env) {
-            Some(value) => value,
-            None => break,
-        };
-        // Execute action
-        let snapshot = env.step(action);
-        done = snapshot.done();
-    }
-}
-
-fn infer_action<B: Backend, M: DQNModel<B>>(
-    agent: &DQN<TrictracEnvironment, B, M>,
-    env: &TrictracEnvironment,
-) -> Option<TrictracAction> {
-    let state = env.state();
-    // Get q-values
-    let q_values = agent
-        .model()
-        .as_ref()
-        .unwrap()
-        .infer(state.to_tensor().unsqueeze());
-    // Get valid actions
-    let valid_actions_indices = get_valid_action_indices(&env.game);
-    if valid_actions_indices.is_empty() {
-        return None; // No valid actions, end of episode
-    }
-    // Set non valid actions q-values to lowest
-    let mut masked_q_values = q_values.clone();
-    let q_values_vec: Vec<f32> = q_values.into_data().into_vec().unwrap();
-    for (index, q_value) in q_values_vec.iter().enumerate() {
-        if !valid_actions_indices.contains(&index) {
-            masked_q_values = masked_q_values.clone().mask_fill(
-                masked_q_values.clone().equal_elem(*q_value),
-                f32::NEG_INFINITY,
-            );
-        }
-    }
-    // Get best action (highest q-value)
-    let action_index = masked_q_values.argmax(1).into_scalar().to_u32();
-    let action = TrictracAction::from(action_index);
-    Some(action)
-}
-
-fn soft_update_tensor<const N: usize, B: Backend>(
-    this: &Param<Tensor<B, N>>,
-    that: &Param<Tensor<B, N>>,
-    tau: ElemType,
-) -> Param<Tensor<B, N>> {
-    let that_weight = that.val();
-    let this_weight = this.val();
-    let new_weight = this_weight * (1.0 - tau) + that_weight * tau;
-
-    Param::initialized(ParamId::new(), new_weight)
-}
-
-pub fn soft_update_linear<B: Backend>(
-    this: Linear<B>,
-    that: &Linear<B>,
-    tau: ElemType,
-) -> Linear<B> {
-    let weight = soft_update_tensor(&this.weight, &that.weight, tau);
-    let bias = match (&this.bias, &that.bias) {
-        (Some(this_bias), Some(that_bias)) => Some(soft_update_tensor(this_bias, that_bias, tau)),
-        _ => None,
-    };
-
-    Linear::<B> { weight, bias }
-}
--- a/bot/src/burnrl/dqn_valid/dqn_model.rs
+++ b/bot/src/burnrl/dqn_valid/dqn_model.rs
@ -1,15 +1,16 @@
-use crate::burnrl::dqn_valid::utils::soft_update_linear;
-use crate::burnrl::environment::TrictracEnvironment;
+use crate::burnrl::environment_big::TrictracEnvironment;
+use crate::burnrl::utils::{soft_update_linear, Config};
+use burn::backend::{ndarray::NdArrayDevice, NdArray};
 use burn::module::Module;
 use burn::nn::{Linear, LinearConfig};
 use burn::optim::AdamWConfig;
+use burn::record::{CompactRecorder, Recorder};
 use burn::tensor::activation::relu;
 use burn::tensor::backend::{AutodiffBackend, Backend};
 use burn::tensor::Tensor;
 use burn_rl::agent::DQN;
 use burn_rl::agent::{DQNModel, DQNTrainingConfig};
-use burn_rl::base::{Action, ElemType, Environment, Memory, Model, State};
-use std::fmt;
+use burn_rl::base::{Action, Agent, ElemType, Environment, Memory, Model, State};
 use std::time::SystemTime;

 #[derive(Module, Debug)]
@ -62,66 +63,18 @@ impl<B: Backend> DQNModel<B> for Net<B> {
 #[allow(unused)]
 const MEMORY_SIZE: usize = 8192;

-pub struct DqnConfig {
-    pub max_steps: usize,
-    pub num_episodes: usize,
-    pub dense_size: usize,
-    pub eps_start: f64,
-    pub eps_end: f64,
-    pub eps_decay: f64,
-
-    pub gamma: f32,
-    pub tau: f32,
-    pub learning_rate: f32,
-    pub batch_size: usize,
-    pub clip_grad: f32,
-}
-
-impl fmt::Display for DqnConfig {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let mut s = String::new();
-        s.push_str(&format!("max_steps={:?}\n", self.max_steps));
-        s.push_str(&format!("num_episodes={:?}\n", self.num_episodes));
-        s.push_str(&format!("dense_size={:?}\n", self.dense_size));
-        s.push_str(&format!("eps_start={:?}\n", self.eps_start));
-        s.push_str(&format!("eps_end={:?}\n", self.eps_end));
-        s.push_str(&format!("eps_decay={:?}\n", self.eps_decay));
-        s.push_str(&format!("gamma={:?}\n", self.gamma));
-        s.push_str(&format!("tau={:?}\n", self.tau));
-        s.push_str(&format!("learning_rate={:?}\n", self.learning_rate));
-        s.push_str(&format!("batch_size={:?}\n", self.batch_size));
-        s.push_str(&format!("clip_grad={:?}\n", self.clip_grad));
-        write!(f, "{s}")
-    }
-}
-
-impl Default for DqnConfig {
-    fn default() -> Self {
-        Self {
-            max_steps: 2000,
-            num_episodes: 1000,
-            dense_size: 256,
-            eps_start: 0.9,
-            eps_end: 0.05,
-            eps_decay: 1000.0,
-
-            gamma: 0.999,
-            tau: 0.005,
-            learning_rate: 0.001,
-            batch_size: 32,
-            clip_grad: 100.0,
-        }
-    }
-}
-
 type MyAgent<E, B> = DQN<E, B, Net<B>>;

 #[allow(unused)]
-pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
-    conf: &DqnConfig,
+// pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
+pub fn run<
+    E: Environment + AsMut<TrictracEnvironment>,
+    B: AutodiffBackend<InnerBackend = NdArray>,
+>(
+    conf: &Config,
    visualized: bool,
-) -> DQN<E, B, Net<B>> {
-    // ) -> impl Agent<E> {
+    // ) -> DQN<E, B, Net<B>> {
+) -> impl Agent<E> {
    let mut env = E::new(visualized);
    env.as_mut().max_steps = conf.max_steps;

@ -189,8 +142,13 @@ pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(

            if snapshot.done() || episode_duration >= conf.max_steps {
                let envmut = env.as_mut();
+                let goodmoves_ratio = ((envmut.goodmoves_count as f32 / episode_duration as f32)
+                    * 100.0)
+                    .round() as u32;
                println!(
-                    "{{\"episode\": {episode}, \"reward\": {episode_reward:.4}, \"steps count\": {episode_duration}, \"epsilon\": {eps_threshold:.3}, \"rollpoints\":{}, \"duration\": {}}}",
+                    "{{\"episode\": {episode}, \"reward\": {episode_reward:.4}, \"steps count\": {episode_duration}, \"epsilon\": {eps_threshold:.3}, \"goodmoves\": {}, \"ratio\": {}%, \"rollpoints\":{}, \"duration\": {}}}",
+                    envmut.goodmoves_count,
+                    goodmoves_ratio,
                    envmut.pointrolls_count,
                    now.elapsed().unwrap().as_secs(),
                );
@ -202,5 +160,35 @@ pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
            }
        }
    }
-    agent
+    let valid_agent = agent.valid();
+    if let Some(path) = &conf.save_path {
+        save_model(valid_agent.model().as_ref().unwrap(), path);
+    }
+    valid_agent
+}
+
+pub fn save_model(model: &Net<NdArray<ElemType>>, path: &String) {
+    let recorder = CompactRecorder::new();
+    let model_path = format!("{path}.mpk");
+    println!("info: Modèle de validation sauvegardé : {model_path}");
+    recorder
+        .record(model.clone().into_record(), model_path.into())
+        .unwrap();
+}
+
+pub fn load_model(dense_size: usize, path: &String) -> Option<Net<NdArray<ElemType>>> {
+    let model_path = format!("{path}.mpk");
+    // println!("Chargement du modèle depuis : {model_path}");
+
+    CompactRecorder::new()
+        .load(model_path.into(), &NdArrayDevice::default())
+        .map(|record| {
+            Net::new(
+                <TrictracEnvironment as Environment>::StateType::size(),
+                dense_size,
+                <TrictracEnvironment as Environment>::ActionType::size(),
+            )
+            .load_record(record)
+        })
+        .ok()
 }
--- a/bot/src/burnrl/dqn/dqn_model.rs
+++ b/bot/src/burnrl/dqn/dqn_model.rs
@ -1,15 +1,16 @@
-use crate::burnrl::dqn::utils::soft_update_linear;
 use crate::burnrl::environment::TrictracEnvironment;
+use crate::burnrl::utils::{soft_update_linear, Config};
+use burn::backend::{ndarray::NdArrayDevice, NdArray};
 use burn::module::Module;
 use burn::nn::{Linear, LinearConfig};
 use burn::optim::AdamWConfig;
+use burn::record::{CompactRecorder, Recorder};
 use burn::tensor::activation::relu;
 use burn::tensor::backend::{AutodiffBackend, Backend};
 use burn::tensor::Tensor;
 use burn_rl::agent::DQN;
 use burn_rl::agent::{DQNModel, DQNTrainingConfig};
-use burn_rl::base::{Action, ElemType, Environment, Memory, Model, State};
-use std::fmt;
+use burn_rl::base::{Action, Agent, ElemType, Environment, Memory, Model, State};
 use std::time::SystemTime;

 #[derive(Module, Debug)]
@ -62,69 +63,18 @@ impl<B: Backend> DQNModel<B> for Net<B> {
 #[allow(unused)]
 const MEMORY_SIZE: usize = 8192;

-pub struct DqnConfig {
-    pub min_steps: f32,
-    pub max_steps: usize,
-    pub num_episodes: usize,
-    pub dense_size: usize,
-    pub eps_start: f64,
-    pub eps_end: f64,
-    pub eps_decay: f64,
-
-    pub gamma: f32,
-    pub tau: f32,
-    pub learning_rate: f32,
-    pub batch_size: usize,
-    pub clip_grad: f32,
-}
-
-impl fmt::Display for DqnConfig {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let mut s = String::new();
-        s.push_str(&format!("min_steps={:?}\n", self.min_steps));
-        s.push_str(&format!("max_steps={:?}\n", self.max_steps));
-        s.push_str(&format!("num_episodes={:?}\n", self.num_episodes));
-        s.push_str(&format!("dense_size={:?}\n", self.dense_size));
-        s.push_str(&format!("eps_start={:?}\n", self.eps_start));
-        s.push_str(&format!("eps_end={:?}\n", self.eps_end));
-        s.push_str(&format!("eps_decay={:?}\n", self.eps_decay));
-        s.push_str(&format!("gamma={:?}\n", self.gamma));
-        s.push_str(&format!("tau={:?}\n", self.tau));
-        s.push_str(&format!("learning_rate={:?}\n", self.learning_rate));
-        s.push_str(&format!("batch_size={:?}\n", self.batch_size));
-        s.push_str(&format!("clip_grad={:?}\n", self.clip_grad));
-        write!(f, "{s}")
-    }
-}
-
-impl Default for DqnConfig {
-    fn default() -> Self {
-        Self {
-            min_steps: 250.0,
-            max_steps: 2000,
-            num_episodes: 1000,
-            dense_size: 256,
-            eps_start: 0.9,
-            eps_end: 0.05,
-            eps_decay: 1000.0,
-
-            gamma: 0.999,
-            tau: 0.005,
-            learning_rate: 0.001,
-            batch_size: 32,
-            clip_grad: 100.0,
-        }
-    }
-}
-
 type MyAgent<E, B> = DQN<E, B, Net<B>>;

 #[allow(unused)]
-pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
-    conf: &DqnConfig,
+// pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
+pub fn run<
+    E: Environment + AsMut<TrictracEnvironment>,
+    B: AutodiffBackend<InnerBackend = NdArray>,
+>(
+    conf: &Config,
    visualized: bool,
-) -> DQN<E, B, Net<B>> {
-    // ) -> impl Agent<E> {
+    // ) -> DQN<E, B, Net<B>> {
+) -> impl Agent<E> {
    let mut env = E::new(visualized);
    // env.as_mut().min_steps = conf.min_steps;
    env.as_mut().max_steps = conf.max_steps;
@ -203,7 +153,6 @@ pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
                    envmut.pointrolls_count,
                    now.elapsed().unwrap().as_secs(),
                );
-                if goodmoves_ratio < 5 && 10 < episode {}
                env.reset();
                episode_done = true;
                now = SystemTime::now();
@ -212,5 +161,35 @@ pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
            }
        }
    }
-    agent
+    let valid_agent = agent.valid();
+    if let Some(path) = &conf.save_path {
+        save_model(valid_agent.model().as_ref().unwrap(), path);
+    }
+    valid_agent
+}
+
+pub fn save_model(model: &Net<NdArray<ElemType>>, path: &String) {
+    let recorder = CompactRecorder::new();
+    let model_path = format!("{path}.mpk");
+    println!("info: Modèle de validation sauvegardé : {model_path}");
+    recorder
+        .record(model.clone().into_record(), model_path.into())
+        .unwrap();
+}
+
+pub fn load_model(dense_size: usize, path: &String) -> Option<Net<NdArray<ElemType>>> {
+    let model_path = format!("{path}.mpk");
+    // println!("Chargement du modèle depuis : {model_path}");
+
+    CompactRecorder::new()
+        .load(model_path.into(), &NdArrayDevice::default())
+        .map(|record| {
+            Net::new(
+                <TrictracEnvironment as Environment>::StateType::size(),
+                dense_size,
+                <TrictracEnvironment as Environment>::ActionType::size(),
+            )
+            .load_record(record)
+        })
+        .ok()
 }
--- a/bot/src/burnrl/dqn_valid/main.rs
+++ b/bot/src/burnrl/dqn_valid/main.rs
@ -1,53 +0,0 @@
-use bot::burnrl::dqn_valid::{
-    dqn_model,
-    utils::{demo_model, load_model, save_model},
-};
-use bot::burnrl::environment;
-use burn::backend::{Autodiff, NdArray};
-use burn_rl::agent::DQN;
-use burn_rl::base::ElemType;
-
-type Backend = Autodiff<NdArray<ElemType>>;
-type Env = environment::TrictracEnvironment;
-
-fn main() {
-    // println!("> Entraînement");
-
-    // See also MEMORY_SIZE in dqn_model.rs : 8192
-    let conf = dqn_model::DqnConfig {
-        //                   defaults
-        num_episodes: 100, // 40
-        max_steps: 1000,   // 1000 max steps by episode
-        dense_size: 256,   // 128  neural network complexity (default 128)
-        eps_start: 0.9,    // 0.9  epsilon initial value (0.9 => more exploration)
-        eps_end: 0.05,     // 0.05
-        // eps_decay higher = epsilon decrease slower
-        // used in : epsilon = eps_end + (eps_start - eps_end) * e^(-step / eps_decay);
-        // epsilon is updated at the start of each episode
-        eps_decay: 2000.0, // 1000 ?
-
-        gamma: 0.999, // 0.999 discount factor. Plus élevé = encourage stratégies à long terme
-        tau: 0.005, // 0.005 soft update rate. Taux de mise à jour du réseau cible. Plus bas = adaptation
-        // plus lente moins sensible aux coups de chance
-        learning_rate: 0.001, // 0.001 taille du pas. Bas : plus lent, haut : risque de ne jamais
-        // converger
-        batch_size: 32, // 32 nombre d'expériences passées sur lesquelles pour calcul de l'erreur moy.
-        clip_grad: 100.0, // 100 limite max de correction à apporter au gradient (default 100)
-    };
-    println!("{conf}----------");
-    let agent = dqn_model::run::<Env, Backend>(&conf, false); //true);
-
-    let valid_agent = agent.valid();
-
-    println!("> Sauvegarde du modèle de validation");
-
-    let path = "bot/models/burn_dqn_valid_40".to_string();
-    save_model(valid_agent.model().as_ref().unwrap(), &path);
-
-    println!("> Chargement du modèle pour test");
-    let loaded_model = load_model(conf.dense_size, &path);
-    let loaded_agent = DQN::new(loaded_model.unwrap());
-
-    println!("> Test avec le modèle chargé");
-    demo_model(loaded_agent);
-}
--- a/bot/src/burnrl/dqn_valid/mod.rs
+++ b/bot/src/burnrl/dqn_valid/mod.rs
@ -1,2 +0,0 @@
-pub mod dqn_model;
-pub mod utils;
--- a/bot/src/burnrl/dqn_valid/utils.rs
+++ b/bot/src/burnrl/dqn_valid/utils.rs
@ -1,112 +0,0 @@
-use crate::burnrl::dqn_valid::dqn_model;
-use crate::burnrl::environment_valid::{TrictracAction, TrictracEnvironment};
-use crate::training_common::get_valid_action_indices;
-use burn::backend::{ndarray::NdArrayDevice, NdArray};
-use burn::module::{Module, Param, ParamId};
-use burn::nn::Linear;
-use burn::record::{CompactRecorder, Recorder};
-use burn::tensor::backend::Backend;
-use burn::tensor::cast::ToElement;
-use burn::tensor::Tensor;
-use burn_rl::agent::{DQNModel, DQN};
-use burn_rl::base::{Action, ElemType, Environment, State};
-
-pub fn save_model(model: &dqn_model::Net<NdArray<ElemType>>, path: &String) {
-    let recorder = CompactRecorder::new();
-    let model_path = format!("{path}_model.mpk");
-    println!("Modèle de validation sauvegardé : {model_path}");
-    recorder
-        .record(model.clone().into_record(), model_path.into())
-        .unwrap();
-}
-
-pub fn load_model(dense_size: usize, path: &String) -> Option<dqn_model::Net<NdArray<ElemType>>> {
-    let model_path = format!("{path}_model.mpk");
-    // println!("Chargement du modèle depuis : {model_path}");
-
-    CompactRecorder::new()
-        .load(model_path.into(), &NdArrayDevice::default())
-        .map(|record| {
-            dqn_model::Net::new(
-                <TrictracEnvironment as Environment>::StateType::size(),
-                dense_size,
-                <TrictracEnvironment as Environment>::ActionType::size(),
-            )
-            .load_record(record)
-        })
-        .ok()
-}
-
-pub fn demo_model<B: Backend, M: DQNModel<B>>(agent: DQN<TrictracEnvironment, B, M>) {
-    let mut env = TrictracEnvironment::new(true);
-    let mut done = false;
-    while !done {
-        // let action = match infer_action(&agent, &env, state) {
-        let action = match infer_action(&agent, &env) {
-            Some(value) => value,
-            None => break,
-        };
-        // Execute action
-        let snapshot = env.step(action);
-        done = snapshot.done();
-    }
-}
-
-fn infer_action<B: Backend, M: DQNModel<B>>(
-    agent: &DQN<TrictracEnvironment, B, M>,
-    env: &TrictracEnvironment,
-) -> Option<TrictracAction> {
-    let state = env.state();
-    // Get q-values
-    let q_values = agent
-        .model()
-        .as_ref()
-        .unwrap()
-        .infer(state.to_tensor().unsqueeze());
-    // Get valid actions
-    let valid_actions_indices = get_valid_action_indices(&env.game);
-    if valid_actions_indices.is_empty() {
-        return None; // No valid actions, end of episode
-    }
-    // Set non valid actions q-values to lowest
-    let mut masked_q_values = q_values.clone();
-    let q_values_vec: Vec<f32> = q_values.into_data().into_vec().unwrap();
-    for (index, q_value) in q_values_vec.iter().enumerate() {
-        if !valid_actions_indices.contains(&index) {
-            masked_q_values = masked_q_values.clone().mask_fill(
-                masked_q_values.clone().equal_elem(*q_value),
-                f32::NEG_INFINITY,
-            );
-        }
-    }
-    // Get best action (highest q-value)
-    let action_index = masked_q_values.argmax(1).into_scalar().to_u32();
-    let action = TrictracAction::from(action_index);
-    Some(action)
-}
-
-fn soft_update_tensor<const N: usize, B: Backend>(
-    this: &Param<Tensor<B, N>>,
-    that: &Param<Tensor<B, N>>,
-    tau: ElemType,
-) -> Param<Tensor<B, N>> {
-    let that_weight = that.val();
-    let this_weight = this.val();
-    let new_weight = this_weight * (1.0 - tau) + that_weight * tau;
-
-    Param::initialized(ParamId::new(), new_weight)
-}
-
-pub fn soft_update_linear<B: Backend>(
-    this: Linear<B>,
-    that: &Linear<B>,
-    tau: ElemType,
-) -> Linear<B> {
-    let weight = soft_update_tensor(&this.weight, &that.weight, tau);
-    let bias = match (&this.bias, &that.bias) {
-        (Some(this_bias), Some(that_bias)) => Some(soft_update_tensor(this_bias, that_bias, tau)),
-        _ => None,
-    };
-
-    Linear::<B> { weight, bias }
-}
--- a/bot/src/burnrl/dqn_big/dqn_model.rs
+++ b/bot/src/burnrl/dqn_big/dqn_model.rs
@ -1,15 +1,16 @@
-use crate::burnrl::dqn_big::utils::soft_update_linear;
-use crate::burnrl::environment_big::TrictracEnvironment;
+use crate::burnrl::environment_valid::TrictracEnvironment;
+use crate::burnrl::utils::{soft_update_linear, Config};
+use burn::backend::{ndarray::NdArrayDevice, NdArray};
 use burn::module::Module;
 use burn::nn::{Linear, LinearConfig};
 use burn::optim::AdamWConfig;
+use burn::record::{CompactRecorder, Recorder};
 use burn::tensor::activation::relu;
 use burn::tensor::backend::{AutodiffBackend, Backend};
 use burn::tensor::Tensor;
 use burn_rl::agent::DQN;
 use burn_rl::agent::{DQNModel, DQNTrainingConfig};
-use burn_rl::base::{Action, ElemType, Environment, Memory, Model, State};
-use std::fmt;
+use burn_rl::base::{Action, Agent, ElemType, Environment, Memory, Model, State};
 use std::time::SystemTime;

 #[derive(Module, Debug)]
@ -62,71 +63,19 @@ impl<B: Backend> DQNModel<B> for Net<B> {
 #[allow(unused)]
 const MEMORY_SIZE: usize = 8192;

-pub struct DqnConfig {
-    pub min_steps: f32,
-    pub max_steps: usize,
-    pub num_episodes: usize,
-    pub dense_size: usize,
-    pub eps_start: f64,
-    pub eps_end: f64,
-    pub eps_decay: f64,
-
-    pub gamma: f32,
-    pub tau: f32,
-    pub learning_rate: f32,
-    pub batch_size: usize,
-    pub clip_grad: f32,
-}
-
-impl fmt::Display for DqnConfig {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let mut s = String::new();
-        s.push_str(&format!("min_steps={:?}\n", self.min_steps));
-        s.push_str(&format!("max_steps={:?}\n", self.max_steps));
-        s.push_str(&format!("num_episodes={:?}\n", self.num_episodes));
-        s.push_str(&format!("dense_size={:?}\n", self.dense_size));
-        s.push_str(&format!("eps_start={:?}\n", self.eps_start));
-        s.push_str(&format!("eps_end={:?}\n", self.eps_end));
-        s.push_str(&format!("eps_decay={:?}\n", self.eps_decay));
-        s.push_str(&format!("gamma={:?}\n", self.gamma));
-        s.push_str(&format!("tau={:?}\n", self.tau));
-        s.push_str(&format!("learning_rate={:?}\n", self.learning_rate));
-        s.push_str(&format!("batch_size={:?}\n", self.batch_size));
-        s.push_str(&format!("clip_grad={:?}\n", self.clip_grad));
-        write!(f, "{s}")
-    }
-}
-
-impl Default for DqnConfig {
-    fn default() -> Self {
-        Self {
-            min_steps: 250.0,
-            max_steps: 2000,
-            num_episodes: 1000,
-            dense_size: 256,
-            eps_start: 0.9,
-            eps_end: 0.05,
-            eps_decay: 1000.0,
-
-            gamma: 0.999,
-            tau: 0.005,
-            learning_rate: 0.001,
-            batch_size: 32,
-            clip_grad: 100.0,
-        }
-    }
-}
-
 type MyAgent<E, B> = DQN<E, B, Net<B>>;

 #[allow(unused)]
-pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
-    conf: &DqnConfig,
+// pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
+pub fn run<
+    E: Environment + AsMut<TrictracEnvironment>,
+    B: AutodiffBackend<InnerBackend = NdArray>,
+>(
+    conf: &Config,
    visualized: bool,
-) -> DQN<E, B, Net<B>> {
-    // ) -> impl Agent<E> {
+    // ) -> DQN<E, B, Net<B>> {
+) -> impl Agent<E> {
    let mut env = E::new(visualized);
-    env.as_mut().min_steps = conf.min_steps;
    env.as_mut().max_steps = conf.max_steps;

    let model = Net::<B>::new(
@ -194,8 +143,7 @@ pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
            if snapshot.done() || episode_duration >= conf.max_steps {
                let envmut = env.as_mut();
                println!(
-                    "{{\"episode\": {episode}, \"reward\": {episode_reward:.4}, \"steps count\": {episode_duration}, \"epsilon\": {eps_threshold:.3}, \"goodmoves\": {}, \"rollpoints\":{}, \"duration\": {}}}",
-                    envmut.goodmoves_count,
+                    "{{\"episode\": {episode}, \"reward\": {episode_reward:.4}, \"steps count\": {episode_duration}, \"epsilon\": {eps_threshold:.3},  \"rollpoints\":{}, \"duration\": {}}}",
                    envmut.pointrolls_count,
                    now.elapsed().unwrap().as_secs(),
                );
@ -207,5 +155,35 @@ pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
            }
        }
    }
-    agent
+    let valid_agent = agent.valid();
+    if let Some(path) = &conf.save_path {
+        save_model(valid_agent.model().as_ref().unwrap(), path);
+    }
+    valid_agent
+}
+
+pub fn save_model(model: &Net<NdArray<ElemType>>, path: &String) {
+    let recorder = CompactRecorder::new();
+    let model_path = format!("{path}.mpk");
+    println!("info: Modèle de validation sauvegardé : {model_path}");
+    recorder
+        .record(model.clone().into_record(), model_path.into())
+        .unwrap();
+}
+
+pub fn load_model(dense_size: usize, path: &String) -> Option<Net<NdArray<ElemType>>> {
+    let model_path = format!("{path}.mpk");
+    // println!("Chargement du modèle depuis : {model_path}");
+
+    CompactRecorder::new()
+        .load(model_path.into(), &NdArrayDevice::default())
+        .map(|record| {
+            Net::new(
+                <TrictracEnvironment as Environment>::StateType::size(),
+                dense_size,
+                <TrictracEnvironment as Environment>::ActionType::size(),
+            )
+            .load_record(record)
+        })
+        .ok()
 }
--- a/bot/src/burnrl/environment.rs
+++ b/bot/src/burnrl/environment.rs
@ -139,6 +139,7 @@ impl Environment for TrictracEnvironment {

    fn reset(&mut self) -> Snapshot<Self> {
        // Réinitialiser le jeu
+        let history = self.game.history.clone();
        self.game = GameState::new(false);
        self.game.init_player("DQN Agent");
        self.game.init_player("Opponent");
@ -157,18 +158,18 @@ impl Environment for TrictracEnvironment {
        let warning = if self.best_ratio > 0.7 && self.goodmoves_ratio < 0.1 {
            let path = "bot/models/logs/debug.log";
            if let Ok(mut out) = std::fs::File::create(path) {
-                write!(out, "{:?}", self.game.history);
+                write!(out, "{:?}", history);
            }
            "!!!!"
        } else {
            ""
        };
-        println!(
-            "info: correct moves: {} ({}%) {}",
-            self.goodmoves_count,
-            (100.0 * self.goodmoves_ratio).round() as u32,
-            warning
-        );
+        // println!(
+        //     "info: correct moves: {} ({}%) {}",
+        //     self.goodmoves_count,
+        //     (100.0 * self.goodmoves_ratio).round() as u32,
+        //     warning
+        // );
        self.step_count = 0;
        self.pointrolls_count = 0;
        self.goodmoves_count = 0;
@ -369,7 +370,7 @@ impl TrictracEnvironment {
                    if self.game.validate(&dice_event) {
                        self.game.consume(&dice_event);
                        let (points, adv_points) = self.game.dice_points;
-                        reward += REWARD_RATIO * (points - adv_points) as f32;
+                        reward += REWARD_RATIO * (points as f32 - adv_points as f32);
                        if points > 0 {
                            is_rollpoint = true;
                            // println!("info: rolled for {reward}");
@ -479,7 +480,7 @@ impl TrictracEnvironment {
                        PointsRules::new(&opponent_color, &self.game.board, self.game.dice);
                    let (points, adv_points) = points_rules.get_points(dice_roll_count);
                    // Récompense proportionnelle aux points
-                    reward -= REWARD_RATIO * (points - adv_points) as f32;
+                    reward -= REWARD_RATIO * (points as f32 - adv_points as f32);
                }
            }
        }
--- a/bot/src/burnrl/environment_big.rs
+++ b/bot/src/burnrl/environment_big.rs
@ -89,7 +89,6 @@ pub struct TrictracEnvironment {
    current_state: TrictracState,
    episode_reward: f32,
    pub step_count: usize,
-    pub min_steps: f32,
    pub max_steps: usize,
    pub pointrolls_count: usize,
    pub goodmoves_count: usize,
@ -122,7 +121,6 @@ impl Environment for TrictracEnvironment {
            current_state,
            episode_reward: 0.0,
            step_count: 0,
-            min_steps: 250.0,
            max_steps: 2000,
            pointrolls_count: 0,
            goodmoves_count: 0,
@ -196,9 +194,10 @@ impl Environment for TrictracEnvironment {
        }

        // Vérifier si la partie est terminée
-        let max_steps = self.min_steps
-            + (self.max_steps as f32 - self.min_steps)
-                * f32::exp((self.goodmoves_ratio - 1.0) / 0.25);
+        // let max_steps = self.max_steps
+        // let max_steps = self.min_steps
+        //     + (self.max_steps as f32 - self.min_steps)
+        //         * f32::exp((self.goodmoves_ratio - 1.0) / 0.25);
        let done = self.game.stage == Stage::Ended || self.game.determine_winner().is_some();

        if done {
@ -211,7 +210,7 @@ impl Environment for TrictracEnvironment {
                }
            }
        }
-        let terminated = done || self.step_count >= max_steps.round() as usize;
+        let terminated = done || self.step_count >= self.max_steps;

        // Mettre à jour l'état
        self.current_state = TrictracState::from_game_state(&self.game);
--- a/bot/src/burnrl/main.rs
+++ b/bot/src/burnrl/main.rs
@ -0,0 +1,58 @@
+use bot::burnrl::sac_model as burn_model;
+// use bot::burnrl::dqn_big_model as burn_model;
+// use bot::burnrl::dqn_model as burn_model;
+// use bot::burnrl::environment_big::TrictracEnvironment;
+use bot::burnrl::environment::TrictracEnvironment;
+use bot::burnrl::utils::{demo_model, Config};
+use burn::backend::{Autodiff, NdArray};
+use burn_rl::agent::SAC as MyAgent;
+// use burn_rl::agent::DQN as MyAgent;
+use burn_rl::base::ElemType;
+
+type Backend = Autodiff<NdArray<ElemType>>;
+type Env = TrictracEnvironment;
+
+fn main() {
+    let path = "bot/models/burnrl_dqn".to_string();
+    let conf = Config {
+        save_path: Some(path.clone()),
+        num_episodes: 30, // 40
+        max_steps: 1000,  // 1000 max steps by episode
+        dense_size: 256,  // 128  neural network complexity (default 128)
+
+        gamma: 0.9999, // 0.999 discount factor. Plus élevé = encourage stratégies à long terme
+        tau: 0.0005, // 0.005 soft update rate. Taux de mise à jour du réseau cible. Plus bas = adaptation
+        // plus lente moins sensible aux coups de chance
+        learning_rate: 0.001, // 0.001 taille du pas. Bas : plus lent, haut : risque de ne jamais
+        // converger
+        batch_size: 128, // 32 nombre d'expériences passées sur lesquelles pour calcul de l'erreur moy.
+        clip_grad: 70.0, // 100 limite max de correction à apporter au gradient (default 100)
+
+        min_probability: 1e-9,
+
+        eps_start: 0.9, // 0.9  epsilon initial value (0.9 => more exploration)
+        eps_end: 0.05,  // 0.05
+        // eps_decay higher = epsilon decrease slower
+        // used in : epsilon = eps_end + (eps_start - eps_end) * e^(-step / eps_decay);
+        // epsilon is updated at the start of each episode
+        eps_decay: 2000.0, // 1000 ?
+
+        lambda: 0.95,
+        epsilon_clip: 0.2,
+        critic_weight: 0.5,
+        entropy_weight: 0.01,
+        epochs: 8,
+    };
+    println!("{conf}----------");
+
+    let agent = burn_model::run::<Env, Backend>(&conf, false); //true);
+
+    // println!("> Chargement du modèle pour test");
+    // let loaded_model = burn_model::load_model(conf.dense_size, &path);
+    // let loaded_agent: MyAgent<Env, _, _> = MyAgent::new(loaded_model.unwrap());
+    //
+    // println!("> Test avec le modèle chargé");
+    // demo_model(loaded_agent);
+
+    // demo_model::<Env>(agent);
+}
--- a/bot/src/burnrl/mod.rs
+++ b/bot/src/burnrl/mod.rs
@ -1,8 +1,9 @@
-pub mod dqn;
-pub mod dqn_big;
-pub mod dqn_valid;
+pub mod dqn_big_model;
+pub mod dqn_model;
+pub mod dqn_valid_model;
 pub mod environment;
 pub mod environment_big;
 pub mod environment_valid;
-pub mod ppo;
-pub mod sac;
+pub mod ppo_model;
+pub mod sac_model;
+pub mod utils;
--- a/bot/src/burnrl/ppo/main.rs
+++ b/bot/src/burnrl/ppo/main.rs
@ -1,52 +0,0 @@
-use bot::burnrl::environment;
-use bot::burnrl::ppo::{
-    ppo_model,
-    utils::{demo_model, load_model, save_model},
-};
-use burn::backend::{Autodiff, NdArray};
-use burn_rl::agent::PPO;
-use burn_rl::base::ElemType;
-
-type Backend = Autodiff<NdArray<ElemType>>;
-type Env = environment::TrictracEnvironment;
-
-fn main() {
-    // println!("> Entraînement");
-
-    // See also MEMORY_SIZE in ppo_model.rs : 8192
-    let conf = ppo_model::PpoConfig {
-        //                   defaults
-        num_episodes: 50, // 40
-        max_steps: 1000,  // 1000 max steps by episode
-        dense_size: 128,  // 128  neural network complexity (default 128)
-        gamma: 0.999,     // 0.999 discount factor. Plus élevé = encourage stratégies à long terme
-        // plus lente moins sensible aux coups de chance
-        learning_rate: 0.001, // 0.001 taille du pas. Bas : plus lent, haut : risque de ne jamais
-        // converger
-        batch_size: 128, // 32 nombre d'expériences passées sur lesquelles pour calcul de l'erreur moy.
-        clip_grad: 100.0, // 100 limite max de correction à apporter au gradient (default 100)
-
-        lambda: 0.95,
-        epsilon_clip: 0.2,
-        critic_weight: 0.5,
-        entropy_weight: 0.01,
-        epochs: 8,
-    };
-    println!("{conf}----------");
-    let valid_agent = ppo_model::run::<Env, Backend>(&conf, false); //true);
-
-    // let valid_agent = agent.valid(model);
-
-    println!("> Sauvegarde du modèle de validation");
-
-    let path = "bot/models/burnrl_ppo".to_string();
-    panic!("how to do that  : save model");
-    // save_model(valid_agent.model().as_ref().unwrap(), &path);
-
-    // println!("> Chargement du modèle pour test");
-    // let loaded_model = load_model(conf.dense_size, &path);
-    // let loaded_agent = PPO::new(loaded_model.unwrap());
-    //
-    // println!("> Test avec le modèle chargé");
-    // demo_model(loaded_agent);
-}
--- a/bot/src/burnrl/ppo/mod.rs
+++ b/bot/src/burnrl/ppo/mod.rs
@ -1,2 +0,0 @@
-pub mod ppo_model;
-pub mod utils;
--- a/bot/src/burnrl/ppo/utils.rs
+++ b/bot/src/burnrl/ppo/utils.rs
@ -1,88 +0,0 @@
-use crate::burnrl::environment::{TrictracAction, TrictracEnvironment};
-use crate::burnrl::ppo::ppo_model;
-use crate::training_common::get_valid_action_indices;
-use burn::backend::{ndarray::NdArrayDevice, NdArray};
-use burn::module::{Module, Param, ParamId};
-use burn::nn::Linear;
-use burn::record::{CompactRecorder, Recorder};
-use burn::tensor::backend::Backend;
-use burn::tensor::cast::ToElement;
-use burn::tensor::Tensor;
-use burn_rl::agent::{PPOModel, PPO};
-use burn_rl::base::{Action, ElemType, Environment, State};
-
-pub fn save_model(model: &ppo_model::Net<NdArray<ElemType>>, path: &String) {
-    let recorder = CompactRecorder::new();
-    let model_path = format!("{path}.mpk");
-    println!("Modèle de validation sauvegardé : {model_path}");
-    recorder
-        .record(model.clone().into_record(), model_path.into())
-        .unwrap();
-}
-
-pub fn load_model(dense_size: usize, path: &String) -> Option<ppo_model::Net<NdArray<ElemType>>> {
-    let model_path = format!("{path}.mpk");
-    // println!("Chargement du modèle depuis : {model_path}");
-
-    CompactRecorder::new()
-        .load(model_path.into(), &NdArrayDevice::default())
-        .map(|record| {
-            ppo_model::Net::new(
-                <TrictracEnvironment as Environment>::StateType::size(),
-                dense_size,
-                <TrictracEnvironment as Environment>::ActionType::size(),
-            )
-            .load_record(record)
-        })
-        .ok()
-}
-
-pub fn demo_model<B: Backend, M: PPOModel<B>>(agent: PPO<TrictracEnvironment, B, M>) {
-    let mut env = TrictracEnvironment::new(true);
-    let mut done = false;
-    while !done {
-        // let action = match infer_action(&agent, &env, state) {
-        let action = match infer_action(&agent, &env) {
-            Some(value) => value,
-            None => break,
-        };
-        // Execute action
-        let snapshot = env.step(action);
-        done = snapshot.done();
-    }
-}
-
-fn infer_action<B: Backend, M: PPOModel<B>>(
-    agent: &PPO<TrictracEnvironment, B, M>,
-    env: &TrictracEnvironment,
-) -> Option<TrictracAction> {
-    let state = env.state();
-    panic!("how to do that ?");
-    None
-    // Get q-values
-    // let q_values = agent
-    //     .model()
-    //     .as_ref()
-    //     .unwrap()
-    //     .infer(state.to_tensor().unsqueeze());
-    // // Get valid actions
-    // let valid_actions_indices = get_valid_action_indices(&env.game);
-    // if valid_actions_indices.is_empty() {
-    //     return None; // No valid actions, end of episode
-    // }
-    // // Set non valid actions q-values to lowest
-    // let mut masked_q_values = q_values.clone();
-    // let q_values_vec: Vec<f32> = q_values.into_data().into_vec().unwrap();
-    // for (index, q_value) in q_values_vec.iter().enumerate() {
-    //     if !valid_actions_indices.contains(&index) {
-    //         masked_q_values = masked_q_values.clone().mask_fill(
-    //             masked_q_values.clone().equal_elem(*q_value),
-    //             f32::NEG_INFINITY,
-    //         );
-    //     }
-    // }
-    // // Get best action (highest q-value)
-    // let action_index = masked_q_values.argmax(1).into_scalar().to_u32();
-    // let action = TrictracAction::from(action_index);
-    // Some(action)
-}
--- a/bot/src/burnrl/ppo/ppo_model.rs
+++ b/bot/src/burnrl/ppo/ppo_model.rs
@ -1,4 +1,5 @@
 use crate::burnrl::environment::TrictracEnvironment;
+use crate::burnrl::utils::Config;
 use burn::module::Module;
 use burn::nn::{Initializer, Linear, LinearConfig};
 use burn::optim::AdamWConfig;
@ -7,7 +8,6 @@ use burn::tensor::backend::{AutodiffBackend, Backend};
 use burn::tensor::Tensor;
 use burn_rl::agent::{PPOModel, PPOOutput, PPOTrainingConfig, PPO};
 use burn_rl::base::{Action, Agent, ElemType, Environment, Memory, Model, State};
-use std::fmt;
 use std::time::SystemTime;

 #[derive(Module, Debug)]
@ -54,64 +54,11 @@ impl<B: Backend> PPOModel<B> for Net<B> {}
 #[allow(unused)]
 const MEMORY_SIZE: usize = 512;

-pub struct PpoConfig {
-    pub max_steps: usize,
-    pub num_episodes: usize,
-    pub dense_size: usize,
-
-    pub gamma: f32,
-    pub lambda: f32,
-    pub epsilon_clip: f32,
-    pub critic_weight: f32,
-    pub entropy_weight: f32,
-    pub learning_rate: f32,
-    pub epochs: usize,
-    pub batch_size: usize,
-    pub clip_grad: f32,
-}
-
-impl fmt::Display for PpoConfig {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let mut s = String::new();
-        s.push_str(&format!("max_steps={:?}\n", self.max_steps));
-        s.push_str(&format!("num_episodes={:?}\n", self.num_episodes));
-        s.push_str(&format!("dense_size={:?}\n", self.dense_size));
-        s.push_str(&format!("gamma={:?}\n", self.gamma));
-        s.push_str(&format!("lambda={:?}\n", self.lambda));
-        s.push_str(&format!("epsilon_clip={:?}\n", self.epsilon_clip));
-        s.push_str(&format!("critic_weight={:?}\n", self.critic_weight));
-        s.push_str(&format!("entropy_weight={:?}\n", self.entropy_weight));
-        s.push_str(&format!("learning_rate={:?}\n", self.learning_rate));
-        s.push_str(&format!("epochs={:?}\n", self.epochs));
-        s.push_str(&format!("batch_size={:?}\n", self.batch_size));
-        write!(f, "{s}")
-    }
-}
-
-impl Default for PpoConfig {
-    fn default() -> Self {
-        Self {
-            max_steps: 2000,
-            num_episodes: 1000,
-            dense_size: 256,
-
-            gamma: 0.99,
-            lambda: 0.95,
-            epsilon_clip: 0.2,
-            critic_weight: 0.5,
-            entropy_weight: 0.01,
-            learning_rate: 0.001,
-            epochs: 8,
-            batch_size: 8,
-            clip_grad: 100.0,
-        }
-    }
-}
 type MyAgent<E, B> = PPO<E, B, Net<B>>;

 #[allow(unused)]
 pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
-    conf: &PpoConfig,
+    conf: &Config,
    visualized: bool,
    // ) -> PPO<E, B, Net<B>> {
 ) -> impl Agent<E> {
@ -179,6 +126,9 @@ pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
        memory.clear();
    }

-    agent.valid(model)
-    // agent
+    let valid_agent = agent.valid(model);
+    if let Some(path) = &conf.save_path {
+        // save_model(???, path);
+    }
+    valid_agent
 }
--- a/bot/src/burnrl/sac/main.rs
+++ b/bot/src/burnrl/sac/main.rs
@ -1,45 +0,0 @@
-use bot::burnrl::environment;
-use bot::burnrl::sac::{sac_model, utils::demo_model};
-use burn::backend::{Autodiff, NdArray};
-use burn_rl::agent::SAC;
-use burn_rl::base::ElemType;
-
-type Backend = Autodiff<NdArray<ElemType>>;
-type Env = environment::TrictracEnvironment;
-
-fn main() {
-    // println!("> Entraînement");
-
-    // See also MEMORY_SIZE in dqn_model.rs : 8192
-    let conf = sac_model::SacConfig {
-        //                   defaults
-        num_episodes: 50, // 40
-        max_steps: 1000,  // 1000 max steps by episode
-        dense_size: 256,  // 128  neural network complexity (default 128)
-
-        gamma: 0.999, // 0.999 discount factor. Plus élevé = encourage stratégies à long terme
-        tau: 0.005, // 0.005 soft update rate. Taux de mise à jour du réseau cible. Plus bas = adaptation
-        // plus lente moins sensible aux coups de chance
-        learning_rate: 0.001, // 0.001 taille du pas. Bas : plus lent, haut : risque de ne jamais
-        // converger
-        batch_size: 32, // 32 nombre d'expériences passées sur lesquelles pour calcul de l'erreur moy.
-        clip_grad: 1.0, // 1.0 limite max de correction à apporter au gradient
-        min_probability: 1e-9,
-    };
-    println!("{conf}----------");
-    let valid_agent = sac_model::run::<Env, Backend>(&conf, false); //true);
-
-    // let valid_agent = agent.valid();
-
-    // println!("> Sauvegarde du modèle de validation");
-    //
-    // let path = "bot/models/burnrl_dqn".to_string();
-    // save_model(valid_agent.model().as_ref().unwrap(), &path);
-    //
-    // println!("> Chargement du modèle pour test");
-    // let loaded_model = load_model(conf.dense_size, &path);
-    // let loaded_agent = DQN::new(loaded_model.unwrap());
-    //
-    // println!("> Test avec le modèle chargé");
-    // demo_model(loaded_agent);
-}
--- a/bot/src/burnrl/sac/mod.rs
+++ b/bot/src/burnrl/sac/mod.rs
@ -1,2 +0,0 @@
-pub mod sac_model;
-pub mod utils;
--- a/bot/src/burnrl/sac/utils.rs
+++ b/bot/src/burnrl/sac/utils.rs
@ -1,78 +0,0 @@
-use crate::burnrl::environment::{TrictracAction, TrictracEnvironment};
-use crate::burnrl::sac::sac_model;
-use crate::training_common::get_valid_action_indices;
-use burn::backend::{ndarray::NdArrayDevice, NdArray};
-use burn::module::{Module, Param, ParamId};
-use burn::nn::Linear;
-use burn::record::{CompactRecorder, Recorder};
-use burn::tensor::backend::Backend;
-use burn::tensor::cast::ToElement;
-use burn::tensor::Tensor;
-// use burn_rl::agent::{SACModel, SAC};
-use burn_rl::base::{Agent, ElemType, Environment};
-
-// pub fn save_model(model: &sac_model::Net<NdArray<ElemType>>, path: &String) {
-//     let recorder = CompactRecorder::new();
-//     let model_path = format!("{path}.mpk");
-//     println!("Modèle de validation sauvegardé : {model_path}");
-//     recorder
-//         .record(model.clone().into_record(), model_path.into())
-//         .unwrap();
-// }
-//
-// pub fn load_model(dense_size: usize, path: &String) -> Option<sac_model::Net<NdArray<ElemType>>> {
-//     let model_path = format!("{path}.mpk");
-//     // println!("Chargement du modèle depuis : {model_path}");
-//
-//     CompactRecorder::new()
-//         .load(model_path.into(), &NdArrayDevice::default())
-//         .map(|record| {
-//             dqn_model::Net::new(
-//                 <TrictracEnvironment as Environment>::StateType::size(),
-//                 dense_size,
-//                 <TrictracEnvironment as Environment>::ActionType::size(),
-//             )
-//             .load_record(record)
-//         })
-//         .ok()
-// }
-//
-
-pub fn demo_model<E: Environment>(agent: impl Agent<E>) {
-    let mut env = E::new(true);
-    let mut state = env.state();
-    let mut done = false;
-    while !done {
-        if let Some(action) = agent.react(&state) {
-            let snapshot = env.step(action);
-            state = *snapshot.state();
-            done = snapshot.done();
-        }
-    }
-}
-
-fn soft_update_tensor<const N: usize, B: Backend>(
-    this: &Param<Tensor<B, N>>,
-    that: &Param<Tensor<B, N>>,
-    tau: ElemType,
-) -> Param<Tensor<B, N>> {
-    let that_weight = that.val();
-    let this_weight = this.val();
-    let new_weight = this_weight * (1.0 - tau) + that_weight * tau;
-
-    Param::initialized(ParamId::new(), new_weight)
-}
-
-pub fn soft_update_linear<B: Backend>(
-    this: Linear<B>,
-    that: &Linear<B>,
-    tau: ElemType,
-) -> Linear<B> {
-    let weight = soft_update_tensor(&this.weight, &that.weight, tau);
-    let bias = match (&this.bias, &that.bias) {
-        (Some(this_bias), Some(that_bias)) => Some(soft_update_tensor(this_bias, that_bias, tau)),
-        _ => None,
-    };
-
-    Linear::<B> { weight, bias }
-}
--- a/bot/src/burnrl/sac/sac_model.rs
+++ b/bot/src/burnrl/sac/sac_model.rs
@ -1,14 +1,15 @@
 use crate::burnrl::environment::TrictracEnvironment;
-use crate::burnrl::sac::utils::soft_update_linear;
+use crate::burnrl::utils::{soft_update_linear, Config};
+use burn::backend::{ndarray::NdArrayDevice, NdArray};
 use burn::module::Module;
 use burn::nn::{Linear, LinearConfig};
 use burn::optim::AdamWConfig;
+use burn::record::{CompactRecorder, Recorder};
 use burn::tensor::activation::{relu, softmax};
 use burn::tensor::backend::{AutodiffBackend, Backend};
 use burn::tensor::Tensor;
 use burn_rl::agent::{SACActor, SACCritic, SACNets, SACOptimizer, SACTrainingConfig, SAC};
 use burn_rl::base::{Action, Agent, ElemType, Environment, Memory, Model, State};
-use std::fmt;
 use std::time::SystemTime;

 #[derive(Module, Debug)]
@ -92,57 +93,11 @@ impl<B: Backend> SACCritic<B> for Critic<B> {
 #[allow(unused)]
 const MEMORY_SIZE: usize = 4096;

-pub struct SacConfig {
-    pub max_steps: usize,
-    pub num_episodes: usize,
-    pub dense_size: usize,
-
-    pub gamma: f32,
-    pub tau: f32,
-    pub learning_rate: f32,
-    pub batch_size: usize,
-    pub clip_grad: f32,
-    pub min_probability: f32,
-}
-
-impl Default for SacConfig {
-    fn default() -> Self {
-        Self {
-            max_steps: 2000,
-            num_episodes: 1000,
-            dense_size: 32,
-
-            gamma: 0.999,
-            tau: 0.005,
-            learning_rate: 0.001,
-            batch_size: 32,
-            clip_grad: 1.0,
-            min_probability: 1e-9,
-        }
-    }
-}
-
-impl fmt::Display for SacConfig {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let mut s = String::new();
-        s.push_str(&format!("max_steps={:?}\n", self.max_steps));
-        s.push_str(&format!("num_episodes={:?}\n", self.num_episodes));
-        s.push_str(&format!("dense_size={:?}\n", self.dense_size));
-        s.push_str(&format!("gamma={:?}\n", self.gamma));
-        s.push_str(&format!("tau={:?}\n", self.tau));
-        s.push_str(&format!("learning_rate={:?}\n", self.learning_rate));
-        s.push_str(&format!("batch_size={:?}\n", self.batch_size));
-        s.push_str(&format!("clip_grad={:?}\n", self.clip_grad));
-        s.push_str(&format!("min_probability={:?}\n", self.min_probability));
-        write!(f, "{s}")
-    }
-}
-
 type MyAgent<E, B> = SAC<E, B, Actor<B>>;

 #[allow(unused)]
 pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
-    conf: &SacConfig,
+    conf: &Config,
    visualized: bool,
 ) -> impl Agent<E> {
    let mut env = E::new(visualized);
@ -229,5 +184,35 @@ pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
        }
    }

-    agent.valid(nets.actor)
+    let valid_agent = agent.valid(nets.actor);
+    if let Some(path) = &conf.save_path {
+        // save_model(???, path);
    }
+    valid_agent
+}
+
+// pub fn save_model(model: ???, path: &String) {
+//     let recorder = CompactRecorder::new();
+//     let model_path = format!("{path}.mpk");
+//     println!("info: Modèle de validation sauvegardé : {model_path}");
+//     recorder
+//         .record(model.clone().into_record(), model_path.into())
+//         .unwrap();
+// }
+//
+// pub fn load_model(dense_size: usize, path: &String) -> Option<Actor<NdArray<ElemType>>> {
+//     let model_path = format!("{path}.mpk");
+//     // println!("Chargement du modèle depuis : {model_path}");
+//
+//     CompactRecorder::new()
+//         .load(model_path.into(), &NdArrayDevice::default())
+//         .map(|record| {
+//             Actor::new(
+//                 <TrictracEnvironment as Environment>::StateType::size(),
+//                 dense_size,
+//                 <TrictracEnvironment as Environment>::ActionType::size(),
+//             )
+//             .load_record(record)
+//         })
+//         .ok()
+// }
--- a/bot/src/burnrl/utils.rs
+++ b/bot/src/burnrl/utils.rs
@ -0,0 +1,121 @@
+use burn::module::{Param, ParamId};
+use burn::nn::Linear;
+use burn::tensor::backend::Backend;
+use burn::tensor::Tensor;
+use burn_rl::base::{Agent, ElemType, Environment};
+
+pub struct Config {
+    pub save_path: Option<String>,
+    pub max_steps: usize,
+    pub num_episodes: usize,
+    pub dense_size: usize,
+
+    pub gamma: f32,
+    pub tau: f32,
+    pub learning_rate: f32,
+    pub batch_size: usize,
+    pub clip_grad: f32,
+
+    // for SAC
+    pub min_probability: f32,
+
+    // for DQN
+    pub eps_start: f64,
+    pub eps_end: f64,
+    pub eps_decay: f64,
+
+    // for PPO
+    pub lambda: f32,
+    pub epsilon_clip: f32,
+    pub critic_weight: f32,
+    pub entropy_weight: f32,
+    pub epochs: usize,
+}
+
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            save_path: None,
+            max_steps: 2000,
+            num_episodes: 1000,
+            dense_size: 256,
+            gamma: 0.999,
+            tau: 0.005,
+            learning_rate: 0.001,
+            batch_size: 32,
+            clip_grad: 100.0,
+            min_probability: 1e-9,
+            eps_start: 0.9,
+            eps_end: 0.05,
+            eps_decay: 1000.0,
+            lambda: 0.95,
+            epsilon_clip: 0.2,
+            critic_weight: 0.5,
+            entropy_weight: 0.01,
+            epochs: 8,
+        }
+    }
+}
+
+impl std::fmt::Display for Config {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let mut s = String::new();
+        s.push_str(&format!("max_steps={:?}\n", self.max_steps));
+        s.push_str(&format!("num_episodes={:?}\n", self.num_episodes));
+        s.push_str(&format!("dense_size={:?}\n", self.dense_size));
+        s.push_str(&format!("eps_start={:?}\n", self.eps_start));
+        s.push_str(&format!("eps_end={:?}\n", self.eps_end));
+        s.push_str(&format!("eps_decay={:?}\n", self.eps_decay));
+        s.push_str(&format!("gamma={:?}\n", self.gamma));
+        s.push_str(&format!("tau={:?}\n", self.tau));
+        s.push_str(&format!("learning_rate={:?}\n", self.learning_rate));
+        s.push_str(&format!("batch_size={:?}\n", self.batch_size));
+        s.push_str(&format!("clip_grad={:?}\n", self.clip_grad));
+        s.push_str(&format!("min_probability={:?}\n", self.min_probability));
+        s.push_str(&format!("lambda={:?}\n", self.lambda));
+        s.push_str(&format!("epsilon_clip={:?}\n", self.epsilon_clip));
+        s.push_str(&format!("critic_weight={:?}\n", self.critic_weight));
+        s.push_str(&format!("entropy_weight={:?}\n", self.entropy_weight));
+        s.push_str(&format!("epochs={:?}\n", self.epochs));
+        write!(f, "{s}")
+    }
+}
+
+pub fn demo_model<E: Environment>(agent: impl Agent<E>) {
+    let mut env = E::new(true);
+    let mut state = env.state();
+    let mut done = false;
+    while !done {
+        if let Some(action) = agent.react(&state) {
+            let snapshot = env.step(action);
+            state = *snapshot.state();
+            done = snapshot.done();
+        }
+    }
+}
+
+fn soft_update_tensor<const N: usize, B: Backend>(
+    this: &Param<Tensor<B, N>>,
+    that: &Param<Tensor<B, N>>,
+    tau: ElemType,
+) -> Param<Tensor<B, N>> {
+    let that_weight = that.val();
+    let this_weight = this.val();
+    let new_weight = this_weight * (1.0 - tau) + that_weight * tau;
+
+    Param::initialized(ParamId::new(), new_weight)
+}
+
+pub fn soft_update_linear<B: Backend>(
+    this: Linear<B>,
+    that: &Linear<B>,
+    tau: ElemType,
+) -> Linear<B> {
+    let weight = soft_update_tensor(&this.weight, &that.weight, tau);
+    let bias = match (&this.bias, &that.bias) {
+        (Some(this_bias), Some(that_bias)) => Some(soft_update_tensor(this_bias, that_bias, tau)),
+        _ => None,
+    };
+
+    Linear::<B> { weight, bias }
+}
--- a/bot/src/strategy/dqnburn.rs
+++ b/bot/src/strategy/dqnburn.rs
@ -6,8 +6,9 @@ use crate::{BotStrategy, CheckerMove, Color, GameState, PlayerId};
 use log::info;
 use store::MoveRules;

-use crate::burnrl::dqn::{dqn_model, utils};
+use crate::burnrl::dqn_model;
 use crate::burnrl::environment;
+use crate::burnrl::utils;
 use crate::training_common::{get_valid_action_indices, sample_valid_action, TrictracAction};

 type DqnBurnNetwork = dqn_model::Net<NdArray<ElemType>>;
@ -40,7 +41,7 @@ impl DqnBurnStrategy {
    pub fn new_with_model(model_path: &String) -> Self {
        info!("Loading model {model_path:?}");
        let mut strategy = Self::new();
-        strategy.model = utils::load_model(256, model_path);
+        strategy.model = dqn_model::load_model(256, model_path);
        strategy
    }