wip stackoverflow debug
This commit is contained in:
parent
3e1775428d
commit
fd269b491d
|
|
@ -9,6 +9,7 @@ use burn::tensor::Tensor;
|
||||||
use burn_rl::agent::DQN;
|
use burn_rl::agent::DQN;
|
||||||
use burn_rl::agent::{DQNModel, DQNTrainingConfig};
|
use burn_rl::agent::{DQNModel, DQNTrainingConfig};
|
||||||
use burn_rl::base::{Action, Agent, ElemType, Environment, Memory, Model, State};
|
use burn_rl::base::{Action, Agent, ElemType, Environment, Memory, Model, State};
|
||||||
|
use std::time::{Duration, SystemTime};
|
||||||
|
|
||||||
#[derive(Module, Debug)]
|
#[derive(Module, Debug)]
|
||||||
pub struct Net<B: Backend> {
|
pub struct Net<B: Backend> {
|
||||||
|
|
@ -99,6 +100,7 @@ pub fn run<E: Environment, B: AutodiffBackend>(
|
||||||
let mut episode_reward: ElemType = 0.0;
|
let mut episode_reward: ElemType = 0.0;
|
||||||
let mut episode_duration = 0_usize;
|
let mut episode_duration = 0_usize;
|
||||||
let mut state = env.state();
|
let mut state = env.state();
|
||||||
|
let mut now = SystemTime::now();
|
||||||
|
|
||||||
while !episode_done {
|
while !episode_done {
|
||||||
let eps_threshold =
|
let eps_threshold =
|
||||||
|
|
@ -131,9 +133,13 @@ pub fn run<E: Environment, B: AutodiffBackend>(
|
||||||
episode_done = true;
|
episode_done = true;
|
||||||
|
|
||||||
println!(
|
println!(
|
||||||
"{{\"episode\": {}, \"reward\": {:.4}, \"duration\": {}}}",
|
"{{\"episode\": {}, \"reward\": {:.4}, \"steps count\": {}, \"duration\": {}}}",
|
||||||
episode, episode_reward, episode_duration
|
episode,
|
||||||
|
episode_reward,
|
||||||
|
episode_duration,
|
||||||
|
now.elapsed().unwrap().as_secs()
|
||||||
);
|
);
|
||||||
|
now = SystemTime::now();
|
||||||
} else {
|
} else {
|
||||||
state = *snapshot.state();
|
state = *snapshot.state();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
12
devenv.lock
12
devenv.lock
|
|
@ -3,10 +3,10 @@
|
||||||
"devenv": {
|
"devenv": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"dir": "src/modules",
|
"dir": "src/modules",
|
||||||
"lastModified": 1747717470,
|
"lastModified": 1753667201,
|
||||||
"owner": "cachix",
|
"owner": "cachix",
|
||||||
"repo": "devenv",
|
"repo": "devenv",
|
||||||
"rev": "c7f2256ee4a4a4ee9cbf1e82a6e49b253c374995",
|
"rev": "4d584d7686a50387f975879788043e55af9f0ad4",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -40,10 +40,10 @@
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1747372754,
|
"lastModified": 1750779888,
|
||||||
"owner": "cachix",
|
"owner": "cachix",
|
||||||
"repo": "git-hooks.nix",
|
"repo": "git-hooks.nix",
|
||||||
"rev": "80479b6ec16fefd9c1db3ea13aeb038c60530f46",
|
"rev": "16ec914f6fb6f599ce988427d9d94efddf25fe6d",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -74,10 +74,10 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1747958103,
|
"lastModified": 1753432016,
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "fe51d34885f7b5e3e7b59572796e1bcb427eccb1",
|
"rev": "6027c30c8e9810896b92429f0092f624f7b1aace",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,21 @@
|
||||||
|
|
||||||
## TODO
|
## TODO
|
||||||
|
|
||||||
|
### stack overflow
|
||||||
|
|
||||||
|
- <https://crates.io/crates/backtrace-on-stack-overflow>
|
||||||
|
- <https://users.rust-lang.org/t/how-to-diagnose-a-stack-overflow-issues-cause/17320/11>
|
||||||
|
- <https://www.reddit.com/r/rust/comments/1d8lxtd/debugging_stack_overflows/>
|
||||||
|
|
||||||
|
Méthodes pour limiter la stack : réduire la taille de la pile avant de lancer ton binaire en ligne de commande :
|
||||||
|
|
||||||
|
```sh
|
||||||
|
ulimit -s 6144 # Limite la pile à 6Mo
|
||||||
|
# just trainbot
|
||||||
|
RUST_BACKTRACE=1 LD_LIBRARY_PATH=./target/debug ./target/debug/train_dqn_burn
|
||||||
|
ulimit -s unlimited # Pour revenir à la normale
|
||||||
|
```
|
||||||
|
|
||||||
- bot burn
|
- bot burn
|
||||||
- train = `just trainbot`
|
- train = `just trainbot`
|
||||||
- durée d'entrainement selon params ?
|
- durée d'entrainement selon params ?
|
||||||
|
|
|
||||||
|
|
@ -43,4 +43,4 @@ fatal runtime error: stack overflow
|
||||||
error: Recipe `trainbot` was terminated on line 25 by signal 6
|
error: Recipe `trainbot` was terminated on line 25 by signal 6
|
||||||
```
|
```
|
||||||
|
|
||||||
Au bout du 12ème épisode (plus de 6 heures sur ma machine), l'entraînement s'arrête avec une erreur stack overflow. Peux-tu m'aider à diagnostiquer d'où peut provenir le problème ? Y a-t-il des outils qui permettent de détecter les zones de code qui utilisent le plus la stack ? Pour information j'ai vu ce rapport de bug https://github.com/yunjhongwu/burn-rl-examples/issues/40, donc peut-être que le problème vient du paquet 'burn-rl'.
|
Au bout du 12ème épisode (plus de 6 heures sur ma machine), l'entraînement s'arrête avec une erreur stack overflow. Peux-tu m'aider à diagnostiquer d'où peut provenir le problème ? Y a-t-il des outils qui permettent de détecter les zones de code qui utilisent le plus la stack ? Pour information j'ai vu ce rapport de bug <https://github.com/yunjhongwu/burn-rl-examples/issues/40>, donc peut-être que le problème vient du paquet 'burn-rl'.
|
||||||
|
|
|
||||||
7
justfile
7
justfile
|
|
@ -21,3 +21,10 @@ trainbot:
|
||||||
#python ./store/python/trainModel.py
|
#python ./store/python/trainModel.py
|
||||||
# cargo run --bin=train_dqn # ok
|
# cargo run --bin=train_dqn # ok
|
||||||
cargo run --bin=train_dqn_burn
|
cargo run --bin=train_dqn_burn
|
||||||
|
debugtrainbot:
|
||||||
|
cargo build --bin=train_dqn_burn
|
||||||
|
RUST_BACKTRACE=1 LD_LIBRARY_PATH=./target/debug ./target/debug/train_dqn_burn
|
||||||
|
profiletrainbot:
|
||||||
|
echo '1' | sudo tee /proc/sys/kernel/perf_event_paranoid
|
||||||
|
cargo build --profile profiling --bin=train_dqn_burn
|
||||||
|
LD_LIBRARY_PATH=./target/debug samply record ./target/profiling/train_dqn_burn
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue