#!/bin/bash

VARIANT=8b7178b25bopt
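
# When this script is invoked outside of Slurm (SLURM_JOB_ID is unset), it
# creates the log directory and resubmits itself via sbatch, then exits; the
# batch copy falls through to the training logic below.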
if [ -z "${SLURM_JOB_ID:-}" ]; then
    mkdir -p logs
    sbatch "$0"
    exit
fi

set -euo pipefail

# Convenience symlinks to the current job's log files (the relative targets
# resolve inside logs/).
ln -f -s "$SLURM_JOB_ID.out" logs/latest.out
ln -f -s "$SLURM_JOB_ID.err" logs/latest.err
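
# Per-variant control and output locations. Creating a file at
# KILL_SWITCH_PATH is assumed to ask the training loop to stop gracefully
# (that is how --kill-switch-path is used further down).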
KILL_SWITCH_PATH=kill-switch-$VARIANT
CHECKPOINT_PATH=checkpoints_$VARIANT
TENSORBOARD_PATH=tensorboard_$VARIANT

VOCAB_FILE="gpt2/vocab.json"
MERGE_FILE="gpt2/merges.txt"

TRAIN_DATA_PATH=train25b.txt
VALID_DATA_PATH=val.txt

PP_SIZE=4
TP_SIZE=4
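
# With tensor parallel 4 x pipeline parallel 4, each model replica spans
# TP_SIZE * PP_SIZE = 16 GPUs; the data-parallel degree is therefore
# WORLD_SIZE / 16 for the world size computed below.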
MICRO_BATCH_SIZE=1
GRADIENT_ACCUMULATION_STEPS=4
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))
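
# Illustrative numbers only: on 16 nodes with 8 GPUs each (the real values
# come from the Slurm allocation), WORLD_SIZE = 8 * 16 = 128 and
# GLOBAL_BATCH_SIZE = 1 * 128 * 4 = 512 samples per optimizer step.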

source model_params.sh
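# model_params.sh is expected to define PARAM_* arrays in the order
# (hidden_size ffn_hidden_size kv_channels num_heads num_layers); the
# PARAM_6796M entry is unpacked positionally below.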
MODEL_PARAM=("${PARAM_6796M[@]}")
NHIDDEN=${MODEL_PARAM[0]}
FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
KV_SIZE=${MODEL_PARAM[2]}
NHEADS=${MODEL_PARAM[3]}
NLAYERS=${MODEL_PARAM[4]}
SEQ_LEN=2048

echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"

SAVE_INTERVAL=5000

TRAIN_SAMPLES=117_910_413
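
# Underscores are digit separators (117,910,413 samples); the argument parser
# is assumed to accept them since Python's int() does. At SEQ_LEN=2048 this
# corresponds to roughly 241B training tokens.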

OPTIMIZER_ARGS=" \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.999 \
    --adam-eps 1e-8 \
    --lr 2e-4 \
    --min-lr 2e-5 \
    --lr-decay-style cosine \
    --lr-decay-samples $TRAIN_SAMPLES \
    --lr-warmup-samples 1_179_104 \
    --clip-grad 1.0 \
    --weight-decay 1e-1 \
    "
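
# The warmup of 1_179_104 samples is about 1% of TRAIN_SAMPLES, and the cosine
# schedule decays over the full run (--lr-decay-samples equals TRAIN_SAMPLES),
# bottoming out at --min-lr.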

GPT_ARGS=" \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --kv-channels $KV_SIZE \
    --ffn-hidden-size $FFN_HIDDEN_SIZE \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train-samples $TRAIN_SAMPLES \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --clip-grad 1.0 \
    --kill-switch-path $KILL_SWITCH_PATH \
    --bf16 \
    $OPTIMIZER_ARGS \
    "

OUTPUT_ARGS=" \
    --log-interval 10 \
    --save-interval $SAVE_INTERVAL \
    --eval-interval 1000 \
    --eval-iters 1 \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "

ZERO_STAGE=0
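
# ZeRO stage 0 disables optimizer-state partitioning; DeepSpeed is still used
# as the runtime (bf16 engine, config below), with memory instead managed by
# the tensor/pipeline parallelism configured above.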

mkdir -p ds_configs
DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"
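
# Write a per-job DeepSpeed config that mirrors the Megatron-side settings
# (batch sizes, gradient clipping, bf16, ZeRO stage) so the two cannot drift.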
cat <<EOF > "$DS_CONFIG_PATH"
{
    "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
    "train_batch_size": $GLOBAL_BATCH_SIZE,
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": $ZERO_STAGE
    },
    "bf16": {
        "enabled": true
    },
    "steps_per_print": 2000,
    "wall_clock_breakdown": false
}
EOF

DEEPSPEED_ARGS=" \
    --deepspeed \
    --deepspeed_config $DS_CONFIG_PATH \
    --zero-stage $ZERO_STAGE \
    "

CMD=" \
    Megatron-DeepSpeed/pretrain_gpt.py \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    $GPT_ARGS \
    $OUTPUT_ARGS \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --train-weighted-split-paths-path $TRAIN_DATA_PATH \
    --valid-weighted-split-paths-path $VALID_DATA_PATH \
    --data-impl mmap \
    $DEEPSPEED_ARGS \
    "

echo $CMD

echo "START $SLURM_JOB_ID: $(date)"
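
# launch.sh is assumed to set up the per-rank distributed environment
# (e.g. RANK/MASTER_ADDR) on each task before invoking python with $CMD.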
srun --label launch.sh $CMD

echo "END $SLURM_JOB_ID: $(date)"