Fix epoch/step indexing in DPO and SFT training loops (last-step check and per-epoch step reset)

ashors1 · ashors1 · commit e46aa89bba76 · 2025-05-15T13:46:51.000-07:00
Signed-off-by: ashors1 <ashors@nvidia.com>
diff --git a/nemo_rl/algorithms/dpo.py b/nemo_rl/algorithms/dpo.py
@@ -397,7 +397,7 @@ def dpo_train(
                 )
 
                 is_last_step = total_steps >= master_config["dpo"]["max_num_steps"] or (
-                    current_epoch + 1 == max_num_epochs
+                    current_epoch == max_num_epochs
                     and current_step == len(train_dataloader)
                 )
 
@@ -466,4 +466,4 @@ def dpo_train(
                 return
 
         current_epoch += 1
-        current_step = 0  # Reset step counter for new epoch
+        current_step = 1  # Reset step counter for new epoch
diff --git a/nemo_rl/algorithms/sft.py b/nemo_rl/algorithms/sft.py
@@ -410,7 +410,7 @@ def sft_train(
                 train_results = policy.train(train_data, loss_fn)
 
                 is_last_step = total_steps >= master_config["sft"]["max_num_steps"] or (
-                    current_epoch + 1 == max_num_epochs
+                    current_epoch == max_num_epochs
                     and current_step == len(train_dataloader)
                 )
 
@@ -487,4 +487,4 @@ def sft_train(
                 return
 
         current_epoch += 1
-        current_step = 0  # Reset step counter for new epoch
+        current_step = 1  # Reset step counter for new epoch

Original file line number	Diff line number	Diff line change
`@@ -397,7 +397,7 @@ def dpo_train(`
`397`	`397`	`)`
`398`	`398`
`399`	`399`	`is_last_step = total_steps >= master_config["dpo"]["max_num_steps"] or (`
`400`		`- current_epoch + 1 == max_num_epochs`
	`400`	`+ current_epoch == max_num_epochs`
`401`	`401`	`and current_step == len(train_dataloader)`
`402`	`402`	`)`
`403`	`403`
`@@ -466,4 +466,4 @@ def dpo_train(`
`466`	`466`	`return`
`467`	`467`
`468`	`468`	`current_epoch += 1`
`469`		`- current_step = 0 # Reset step counter for new epoch`
	`469`	`+ current_step = 1 # Reset step counter for new epoch`