doc params train bot
Commit 4353ba2bd1 (parent 778ac1817b)
@@ -164,6 +164,7 @@ pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
         let mut episode_duration = 0_usize;
         let mut state = env.state();
         let mut now = SystemTime::now();
+        let mut goodmoves_ratio = 0.0;
 
         while !episode_done {
             let eps_threshold = conf.eps_end
@@ -192,13 +193,17 @@ pub fn run<E: Environment + AsMut<TrictracEnvironment>, B: AutodiffBackend>(
             episode_duration += 1;
 
             if snapshot.done() || episode_duration >= conf.max_steps {
-                env.reset();
-                episode_done = true;
-
+                let envmut = env.as_mut();
                 println!(
-                    "{{\"episode\": {episode}, \"reward\": {episode_reward:.4}, \"steps count\": {episode_duration}, \"threshold\": {eps_threshold:.3}, \"duration\": {}}}",
+                    "{{\"episode\": {episode}, \"reward\": {episode_reward:.4}, \"steps count\": {episode_duration}, \"epsilon\": {eps_threshold:.3}, \"goodmoves\": {}, \"gm%\": {:.1}, \"rollpoints\":{}, \"duration\": {}}}",
+                    envmut.goodmoves_count,
+                    goodmoves_ratio * 100.0,
+                    envmut.pointrolls_count,
                     now.elapsed().unwrap().as_secs(),
                 );
+                goodmoves_ratio = envmut.goodmoves_ratio;
+                env.reset();
+                episode_done = true;
                 now = SystemTime::now();
             } else {
                 state = *snapshot.state();
@@ -86,6 +86,7 @@ pub struct TrictracEnvironment {
     pub step_count: usize,
     pub min_steps: f32,
     pub max_steps: usize,
+    pub pointrolls_count: usize,
    pub goodmoves_count: usize,
    pub goodmoves_ratio: f32,
    pub visualized: bool,
@@ -118,6 +119,7 @@ impl Environment for TrictracEnvironment {
             step_count: 0,
             min_steps: 250.0,
             max_steps: 2000,
+            pointrolls_count: 0,
             goodmoves_count: 0,
             goodmoves_ratio: 0.0,
             visualized,
@@ -150,6 +152,7 @@ impl Environment for TrictracEnvironment {
             (100.0 * self.goodmoves_ratio).round() as u32
         );
         self.step_count = 0;
+        self.pointrolls_count = 0;
         self.goodmoves_count = 0;
 
         Snapshot::new(self.current_state, 0.0, false)
@@ -162,12 +165,16 @@ impl Environment for TrictracEnvironment {
         let trictrac_action = Self::convert_action(action);
 
         let mut reward = 0.0;
+        let mut is_rollpoint = false;
         let mut terminated = false;
 
         // Exécuter l'action si c'est le tour de l'agent DQN
         if self.game.active_player_id == self.active_player_id {
             if let Some(action) = trictrac_action {
-                reward = self.execute_action(action);
+                (reward, is_rollpoint) = self.execute_action(action);
+                if is_rollpoint {
+                    self.pointrolls_count += 1;
+                }
                 if reward != Self::ERROR_REWARD {
                     self.goodmoves_count += 1;
                 }
@@ -249,10 +256,11 @@ impl TrictracEnvironment {
     // &mut self,
     // action: dqn_common::TrictracAction,
    // ) -> Result<f32, Box<dyn std::error::Error>> {
-    fn execute_action(&mut self, action: dqn_common::TrictracAction) -> f32 {
+    fn execute_action(&mut self, action: dqn_common::TrictracAction) -> (f32, bool) {
         use dqn_common::TrictracAction;
 
         let mut reward = 0.0;
+        let mut is_rollpoint = false;
 
         let event = match action {
             TrictracAction::Roll => {
@@ -330,7 +338,8 @@ impl TrictracEnvironment {
                 let (points, adv_points) = self.game.dice_points;
                 reward += Self::REWARD_RATIO * (points - adv_points) as f32;
                 if points > 0 {
-                    println!("info: rolled for {reward}");
+                    is_rollpoint = true;
+                    // println!("info: rolled for {reward}");
                 }
                 // Récompense proportionnelle aux points
             }
@@ -343,7 +352,7 @@ impl TrictracEnvironment {
             }
         }
 
-        reward
+        (reward, is_rollpoint)
     }
 
     /// Fait jouer l'adversaire avec une stratégie simple
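Note (not part of the diff): the change above makes execute_action return a (reward, is_rollpoint) tuple and unpacks it at the call site with destructuring assignment. A minimal standalone sketch of that pattern, with illustrative names only:

    // Sketch of the (reward, flag) return pattern; destructuring
    // assignment into existing variables needs Rust 1.59+.
    fn execute_action(points: i32) -> (f32, bool) {
        let reward = points as f32 * 0.1;
        let is_rollpoint = points > 0;
        (reward, is_rollpoint)
    }

    fn main() {
        let mut reward = 0.0;
        let mut is_rollpoint = false;
        let mut pointrolls_count = 0_usize;

        // Unpack the tuple into the existing bindings, then count point rolls.
        (reward, is_rollpoint) = execute_action(3);
        if is_rollpoint {
            pointrolls_count += 1;
        }
        println!("reward={reward}, rollpoints={pointrolls_count}");
    }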
@@ -14,24 +14,25 @@ fn main() {
 
     // See also MEMORY_SIZE in dqn_model.rs : 8192
     let conf = dqn_model::DqnConfig {
-        num_episodes: 40,
-        min_steps: 250.0, // min steps by episode (mise à jour par la fonction)
-        max_steps: 2000, // max steps by episode
-        dense_size: 256, // neural network complexity
-        eps_start: 0.9, // epsilon initial value (0.9 => more exploration)
-        eps_end: 0.05,
+        // defaults
+        num_episodes: 40, // 40
+        min_steps: 500.0, // 1000 min of max steps by episode (mise à jour par la fonction)
+        max_steps: 3000, // 1000 max steps by episode
+        dense_size: 256, // 128 neural network complexity (default 128)
+        eps_start: 0.9, // 0.9 epsilon initial value (0.9 => more exploration)
+        eps_end: 0.05, // 0.05
         // eps_decay higher = epsilon decrease slower
         // used in : epsilon = eps_end + (eps_start - eps_end) * e^(-step / eps_decay);
         // epsilon is updated at the start of each episode
-        eps_decay: 3000.0,
+        eps_decay: 2000.0, // 1000 ?
 
-        gamma: 0.999, // discount factor. Plus élevé = encourage stratégies à long terme
-        tau: 0.005, // soft update rate. Taux de mise à jour du réseau cible. Plus bas = adaptation
+        gamma: 0.999, // 0.999 discount factor. Plus élevé = encourage stratégies à long terme
+        tau: 0.005, // 0.005 soft update rate. Taux de mise à jour du réseau cible. Plus bas = adaptation
         // plus lente moins sensible aux coups de chance
-        learning_rate: 0.001, // taille du pas. Bas : plus lent, haut : risque de ne jamais
+        learning_rate: 0.001, // 0.001 taille du pas. Bas : plus lent, haut : risque de ne jamais
         // converger
-        batch_size: 32, // nombre d'expériences passées sur lesquelles pour calcul de l'erreur moy.
-        clip_grad: 100.0, // plafonnement du gradient : limite max de correction à apporter
+        batch_size: 32, // 32 nombre d'expériences passées sur lesquelles pour calcul de l'erreur moy.
+        clip_grad: 100.0, // 100 limite max de correction à apporter au gradient (default 100)
     };
     println!("{conf}----------");
     let agent = dqn_model::run::<Env, Backend>(&conf, false); //true);
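Note (not part of the diff): the epsilon schedule quoted in the comments is epsilon = eps_end + (eps_start - eps_end) * e^(-step / eps_decay), evaluated at the start of each episode. A small standalone sketch, assuming the values set above (eps_start = 0.9, eps_end = 0.05), compares the new eps_decay = 2000.0 with the previous 3000.0:

    // Standalone sketch of the epsilon schedule from the config comments.
    fn epsilon(step: f64, eps_start: f64, eps_end: f64, eps_decay: f64) -> f64 {
        eps_end + (eps_start - eps_end) * (-step / eps_decay).exp()
    }

    fn main() {
        for step in [0.0, 1000.0, 2000.0, 4000.0] {
            println!(
                "step {step}: eps_decay=2000 -> {:.3}, eps_decay=3000 -> {:.3}",
                epsilon(step, 0.9, 0.05, 2000.0),
                epsilon(step, 0.9, 0.05, 3000.0),
            );
        }
        // step 2000: 0.363 vs 0.486; step 4000: 0.165 vs 0.274
    }

Lower eps_decay means epsilon approaches eps_end sooner, i.e. less late-training exploration.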