Commit
·
44f990d
1
Parent(s):
9149d87
Update sbatch_8b7_178b_25b_jz_tmp.sh
Browse files
sbatch_8b7_178b_25b_jz_tmp.sh
CHANGED
|
@@ -28,7 +28,10 @@ source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0
|
|
| 28 |
GPUS_PER_NODE=8
|
| 29 |
NNODES=$SLURM_NNODES
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
|
|
@@ -171,7 +174,24 @@ echo $CMD
|
|
| 171 |
|
| 172 |
echo "START $SLURM_JOBID: $(date)"
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
# bash launch_srun.sh $CMD
|
| 175 |
-
srun --label launch.sh $CMD
|
| 176 |
|
| 177 |
echo "END $SLURM_JOBID: $(date)"
|
|
|
|
| 28 |
GPUS_PER_NODE=8
|
| 29 |
NNODES=$SLURM_NNODES
|
| 30 |
|
| 31 |
+
# Training data specification, given as a file name relative to the working directory.
# NOTE(review): presumably this file holds a dataset spec like the commented-out
# inline example that follows — confirm against the consumer of TRAIN_DATA_PATH.
TRAIN_DATA_PATH=train55boscar.txt
|
| 32 |
+
# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_55B_text_document"
|
| 33 |
+
# Validation data specification, given as a file name relative to the working directory.
# NOTE(review): presumably this file holds a dataset spec like the commented-out
# inline example that follows — confirm against the consumer of VALID_DATA_PATH.
VALID_DATA_PATH=val.txt
|
| 34 |
+
# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document"
|
| 35 |
|
| 36 |
|
| 37 |
|
|
|
|
| 174 |
|
| 175 |
echo "START $SLURM_JOBID: $(date)"
|
| 176 |
|
| 177 |
+
|
| 178 |
+
### JZ ###
|
| 179 |
+
# Per-node launcher command, kept as a single string because it is spliced
# into the `bash -c` command line passed to srun below.
# GPUS_PER_NODE and NNODES are set near the top of this script.
# NOTE(review): MASTER_ADDR and MASTER_PORT are not assigned in the visible
# part of this script — confirm they are defined before this point.
export LAUNCHER="python -u -m torch.distributed.run \
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --tee 3 \
    "

# Extra srun options. Deliberately a whitespace-separated string: it is
# expanded unquoted below so that each option becomes its own argument.
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    "

# \$SLURM_PROCID is escaped so that it is expanded by the `bash -c` shell
# started by srun, not by this submitting script.
# Output (stdout and stderr) is shown and also appended to the main log file.
# shellcheck disable=SC2086  # SRUN_ARGS must be word-split into options
srun $SRUN_ARGS --jobid "$SLURM_JOBID" bash -c "$LAUNCHER --node_rank \$SLURM_PROCID $CMD" 2>&1 | tee -a "$LOGS_PATH/main_log.txt"
|
| 192 |
+
|
| 193 |
+
### LUMI ###
|
| 194 |
# bash launch_srun.sh $CMD
|
| 195 |
+
# srun --label launch.sh $CMD
|
| 196 |
|
| 197 |
echo "END $SLURM_JOBID: $(date)"
|