| trainer: | |
| default_root_dir: null | |
| default_hdfs_dir: hdfs://haruna/home/byte_data_seed/hdd_hldy/user/yuyu.zhang/seekpath/P61_D6_8B_8M_tp2_stage2_H800_code | |
| logger: | |
| - tracking | |
| - console | |
| log_every_n_steps: 50 | |
| benchmark: false | |
| enable_speedmonitor: true | |
| stats_speedmonitor: false | |
| enable_versions: false | |
| detect_anomaly: false | |
| deterministic: false | |
| accelerator: gpu | |
| accelerator_kwargs: | |
| mega_config: null | |
| precision: bf16 | |
| max_epochs: 1 | |
| max_steps: -1 | |
| limit_train_batches: null | |
| limit_val_batches: null | |
| limit_test_batches: null | |
| static_sync_limit_val: false | |
| sync_batchnorm: false | |
| sync_fit_metrics: null | |
| val_check_interval: | |
| - 20000000 | |
| save_before_val: false | |
| accumulate_grad_batches: null | |
| gradient_clip_val: 1.0 | |
| max_grad_clip: 0.0 | |
| seed: null | |
| summarize_model_depth: 0 | |
| resume_ckpt_path: auto | |
| frozen_ckpt_path: null | |
| resume_strict: true | |
| resume_optimizer: true | |
| resume_metadata: true | |
| resume_loader_state: false | |
| callbacks: null | |
| enable_checkpoint: | |
| - 1 | |
| - 10000 | |
| checkpoint_monitor: step | |
| checkpoint_mode: max | |
| dataloader_timeout: -1 | |
| dataloader_retry_limit: 100 | |
| dataloader_retry_persistent_limit: 5 | |
| find_unused_parameters: false | |
| project_name: seekpath_v3 | |
| experiment_name: P61_D6_8B_npu_8M_tp2_H800stage1_code | |
| enable_trace: false | |
| reload_dataloaders_every_n_epochs: -1 | |
| strategy: megatron | |
| enable_qat: false | |
| no_quant_module: [] | |
| enable_ptq: true | |
| qat_kwargs: {} | |
| optimizer_kwargs: | |
| optimizer: | |
| type: adam | |
| params: | |
| lr: 3.0e-05 | |
| betas: | |
| - 0.9 | |
| - 0.95 | |
| eps: 1.0e-08 | |
| weight_decay: 0.1 | |
| bias_correction: true | |
| adam_w_mode: true | |
| momentum: 0.9 | |
| lr_mult_keys: [] | |
| no_weight_decay_keys: [] | |
| weight_decay_keys: [] | |
| lr_mult_start_epoch: 0 | |
| lr_mult: 1.0 | |
| scheduler: | |
| type: megatron.optimizer_param_schedule.OptimizerParamScheduler | |
| total_steps_param_name: num_training_steps | |
| warmup_steps_param_name: num_warmup_steps | |
| interval: step | |
| params: | |
| warmup_step_rate: 0.002 | |
| lr_end: 0.1 | |
| lr_decay_style: constant | |
| lr_decay_rate: 1.0 | |
| grad_norm_layers: [] | |
| checkpoint_kwargs: | |
| verbose: false | |
| save_last: false | |
| save_weights_only: false | |
| every_n_train_steps: -1 | |
| every_n_seconds: -1 | |
| save_best: false | |
| storage: | |
| enable_shm_download: false | |
| enable_shm_upload: false | |
| download_thread_num: 16 | |
| upload_thread_num: 1 | |
| enable_save_checkpoint_async: true | |
| enable_profiler: false | |
| profiler_schedule_kwargs: | |
| wait: 50 | |
| warmup: 3 | |
| active: 3 | |
| repeat: 1 | |
| profile_all_ranks: false | |
| enable_bsdp: false | |
| bsdp_num_prefetch: 64 | |
| keep_frozen_weights: true | |
| val_reduce_fn: {} | |
| experiment_id: null | |
| enable_omnistore: false | |
| model: | |
| network: | |
| hidden_size: 4096 | |
| n_embed: 4096 | |
| n_inner: 14336 | |
| n_head: 32 | |
| n_layer: 32 | |
| vocab_size: 155136 | |
| max_position_embeddings: 32768 | |
| cross_entropy_spilt_num: 1 | |
| layer_norm_epsilon: 1.0e-05 | |
| activation_function: gelu_new | |
| resid_pdrop: 0.1 | |
| embd_pdrop: 0.0 | |
| attn_pdrop: 0.1 | |
| scale_attn_weights: true | |
| scale_attn_by_inverse_layer_idx: false | |
| reorder_and_upcast_attn: false | |
| initializer_range: 0.009882118 | |
| gradient_checkpointing: false | |
| gradient_checkpointing_ln: false | |
| gradient_checkpointing_mlp: false | |
| gradient_checkpointing_start_layers: 0 | |
| tie_weight: false | |
| pad_idx: 1 | |
| use_ft_flash_attn: false | |
| use_ft_linear: false | |
| use_ft_layernorm: false | |
| use_xperf_rotary: false | |
| use_rmpad: true | |
| fuse_gelu_gemm: false | |
| pad_output: false | |
| position_embeddings_type: rope | |
| skip_n_iters: -1 | |
| n_shared_qhead: 4 | |
| num_q_heads: -1 | |
| num_kv_heads: -1 | |
| head_dim: -1 | |
| kv_mirror_layers: [] | |
| kv_mirror_imitated_layers: [] | |
| residual_post_ln_layers: [] | |
| hyperconnection_rate: -1 | |
| repeat_kv_heads: true | |
| sparse_attention_window_size: | |
| - -1 | |
| use_query_swiglu: false | |
| query_swiglu_inner_dim: 8192 | |
| force_mem_efficient_layers: | |
| - -1 | |
| noop_transformer_layers: [] | |
| dense_ffn_layers: [] | |
| dense_ffn_type: swiglu | |
| dense_ffn_inner_dim: -1 | |
| moe_expert_type: exp-xelego | |
| moe_gate_type: caplog-lego | |
| moe_gate_metric_type: lego | |
| moe_expert_exp_level: 4 | |
| moe_expert_exp_first_dim_factor: 1.0 | |
| moe_expert_exp_first_num: 2 | |
| moe_topk: 5 | |
| moe_num_expert: 0 | |
| moe_expert_eq_dim_factor: 0.25 | |
| moe_backend: default | |
| moe_overlap_recomp_grad_comm: false | |
| moe_expert_op_version: V1 | |
| moe_aux_loss_weight: 0.001 | |
| moe_gate_dropout: 0.0 | |
| moe_use_balance: false | |
| moe_expert_group_capacity: 1.0 | |
| moe_expert_group_balance_loss_weight: 0.0 | |
| moe_expert_groups_in_ep_rank: 1 | |
| moe_enable_warmup: false | |
| moe_swiglu_fc1_2_init_scale: 1.0 | |
| janus_use_big_op: false | |
| janus_big_op_version: V1 | |
| janus_big_op_attn_grad_accum_fusion: true | |
| convert_gate_to_fp32: false | |
| moe_enable_ema_update: 1 | |
| query_head_scale_factor: 1 | |
| value_moe_num_expert: 0 | |
| value_moe_qkv_topk: 4 | |
| value_moe_qkv_times: 1 | |
| value_moe_is_repeat: true | |
| value_moe_expert_type: linear-lego | |
| moe_pr_scale_factor: 1.0 | |
| moe_pr_expert_type: disabled | |
| value_moe_gate_type: default-lego | |
| value_moe_gate_metric_type: default | |
| lora_rank: 0 | |
| save_mixed_ckpt_in_shards: false | |
| save_mixed_model_states_freq: final | |
| cont_train_mode: default | |
| fuse_lora_weight: true | |
| rope_mode: default | |
| rope_scale: 1 | |
| rope_base: 500000.0 | |
| rope_cut: false | |
| rope_cut_head_dim: 0 | |
| rope_force_fp32: false | |
| sparse_attention_window_scale: 1 | |
| sparse_attention_global_window_size: | |
| - 0 | |
| use_attention_bias: false | |
| layer_norm_type: rmsnorm_torch | |
| exact_token_as_loss_denominator: false | |
| use_key_layernorm: false | |
| key_norm_after_rope: false | |
| use_query_layernorm: false | |
| use_context_groupnorm: false | |
| use_mariana_gqa_pattern: false | |
| use_sequence_parallel_attention: false | |
| use_sequence_parallel_attention_a2a: false | |
| context_parallel_use_all_gather: false | |
| fp8_use_bf16_layers: '' | |
| deterministic_mode: false | |
| megatron_tensor_parallel_size: 8 | |
| megatron_pipeline_parallel_size: 1 | |
| megatron_context_parallel_size: 1 | |
| megatron_expert_parallel_size: 1 | |
| megatron_expert_parallel_size_in_dp: 1 | |
| megatron_context_parallel_query_only: false | |
| megatron_num_layers_per_virtual_pipeline_stage: 0 | |
| megatron_micro_batch_size: 1 | |
| megatron_global_batch_size: 256 | |
| megatron_sequence_parallel: true | |
| megatron_recompute_granularity: '' | |
| megatron_use_flash_attention: true | |
| megatron_recompute_method: uniform | |
| megatron_recompute_num_layers: 1 | |
| megatron_distribute_saved_activations: false | |
| megatron_enable_distributed_optimizer: true | |
| megatron_use_multi_precision_ddp: false | |
| megatron_sequence_parallel_as_data_parallel_in_optimizer: false | |
| megatron_gather_params_use_alltoall: false | |
| megatron_enable_initial_jit_warmup: true | |
| megatron_accumulate_allreduce_grads_in_fp32: true | |
| megatron_bf16_use_bf16_allreduce_grads: false | |
| megatron_grad_comm_type: '' | |
| megatron_reduce_grads_use_alltoall: false | |
| megatron_scale_loss_in_gradient: false | |
| megatron_scale_gradient_after_allreduce: false | |
| megatron_ddp_impl: local | |
| megatron_bf16_qt: false | |
| megatron_empty_cache_level: 0 | |
| megatron_force_fp32_embed: false | |
| megatron_deterministic_flash_attn: false | |
| megatron_switch_pp_and_dp: false | |
| megatron_timing_log_level: 2 | |
| megatron_no_load_rng: false | |
| megatron_no_save_rng: false | |
| megatron_no_load_optim: false | |
| megatron_mem_efficient_column_parallel: true | |
| megatron_masked_softmax_fusion: true | |
| megatron_bias_gelu_fusion: false | |
| megatron_bias_dropout_fusion: false | |
| megatron_gradient_accumulation_fusion: true | |
| megatron_overlap_p2p_comm: false | |
| megatron_deallocate_pipeline_outputs: true | |
| megatron_timing_log_option: local | |
| megatron_barrier_with_L1_time: false | |
| megatron_strict_align_diff_with_ds: false | |
| megatron_parallel_linear_force_weight_contiguous: false | |
| megatron_use_mariana_softmax: false | |
| megatron_use_mariana_activation: false | |
| megatron_overlap_data_parallel_communication: false | |
| megatron_overlap_dp_grad_comm: false | |
| megatron_overlap_dp_param_comm: false | |
| megatron_early_prefetch_dp_allgather: true | |
| megatron_use_non_sequential_block: false | |
| megatron_overlap_attn_grad_input_comm: true | |
| megatron_sequence_data_parallel_size: -1 | |
| megatron_distributed_sequence_parallel_size: -1 | |
| megatron_num_layers_for_pipeline_stages: [] | |
| megatron_vocab_parallel_embedding_fusion: false | |
| megatron_embedding_reduce_scatter_for_sp: true | |
| megatron_print_args: true | |
| megatron_grad_norm_skip: -1.0 | |
| megatron_reorder_wgrad: false | |
| megatron_offload_activations: false | |
| megatron_offload_ratio: 1.0 | |
| megatron_offload_launch_ratio: 1.0 | |
| megatron_optimizer_offload_main_param: false | |
| megatron_data_parallel_random_init: false | |
| megatron_pipeline_strategy: '' | |
| megatron_pipeline_wgrad_strategy: '' | |
| megatron_pipeline_warmup_overlap: false | |
| megatron_allow_transformer_engine: false | |
| megatron_fp8_e4m3: false | |
| megatron_fp8_hybrid: false | |
| megatron_fp8_wgrad: true | |
| megatron_fp8_dgrad: true | |
| megatron_fp8_margin: 0 | |
| megatron_fp8_interval: 1 | |
| megatron_transformer_impl: local | |
| megatron_fp8_amax_history_len: 1024 | |
| megatron_fp8_amax_compute_algo: max | |
| megatron_use_qlora: false | |
| megatron_qlora_quant_weight_dtype: null | |
| megatron_qlora_quant_real_store: false | |
| megatron_qlora_quant_groupsize: -1 | |
| megatron_qlora_quant_input_dtype: '' | |
| megatron_qlora_quant_aware_lora: false | |
| megatron_qlora_quant_aware_L4Q: false | |
| megatron_terapipe_nano_batch_size: -1 | |
| lora_config: | |
| default: | |
| lora_dropout: 0.0 | |
| lora_rank: 64 | |
| layers: | |
| - all | |
| init_method: normal | |
| init_mode: nonzero_parallel_init | |
| init_kwargs: {} | |
| lora_alpha: 2.0 | |
| use_rslora: true | |
| lora_experts_appr: full | |
| use_qlora: false | |
| qlora_quant_weight_dtype: null | |
| qlora_quant_real_store: false | |
| qlora_quant_aware_L4Q: false | |
| qlora_quant_groupsize: -1 | |
| qlora_quant_input_dtype: null | |
| qlora_quant_aware_lora: false | |
| post_training_quant: false | |
| fully_sharded: false | |
| emb_trainable: true | |
| target_modules: | |
| - query_key_value | |
| - experts | |
| - dense | |
| query_key_value: | |
| lora_rank: -1 | |
| lora_alpha: -1.0 | |
| experts: | |
| lora_rank: -1 | |
| lora_alpha: -1.0 | |
| dense: | |
| lora_rank: -1 | |
| lora_alpha: -1.0 | |
| dense_h_to_4h: | |
| lora_rank: -1 | |
| lora_alpha: -1.0 | |
| dense_4h_to_h: | |
| lora_rank: -1 | |
| lora_alpha: -1.0 | |
| freeze_prefix: null | |
| partial_pretrain: null | |
| partial_pretrain_rename: null | |
| reset_global_step: -1 | |
| override_lr_scheduler: true | |
| start_debug_server: false | |
| clip_token_ids: false | |
| data: | |
| train_path: hdfs://haruna/home/byte_data_seed/hdd_hldy/seed_code_seekpath/pretrained_yaml_new/V1_longct_datacard_hdfs_new_stage2_code_ct_fim_2.yaml | |
| val_path: | |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/D73_val_20240507_2_200M_token_plain_source_v2_1_part | |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/human_all_lite | |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/autoeval_code_val_lite | |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/pretrain_auto_eval_merged_all_20240412_ceval_1_part | |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/merged_few_benchmark_datasets_20240705_1_part | |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/pretrain_auto_eval_merged_all_v0.3_1_part | |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/D74_val_20240621_200M_token_tok643_sa8192_plain_source_v2_1_part_dir | |
| train_size: 5000000000000 | |
| val_size: -1 | |
| train_batch_size: 32 | |
| train_num_workers: 4 | |
| val_batch_size: -1 | |
| val_num_workers: 1 | |
| max_seq_len: 32768 | |
| val_max_seq_len: -1 | |
| text_keys: | |
| - content_split | |
| tokenizer: hdfs://haruna/home/byte_data_seed/hl_lq/seed_code/liuyongfei/tokenizers/bbpe155k-v6.4.3-ml.pret | |
| gpu_prefetch: false | |
| cpu_prefetch: false | |
| dyn_bsz: true | |
| dyn_bsz_margin: 0.0 | |
| stride: -1 | |
| warmup_step_rate: -1.0 | |
| tokenizer_type: bbpe | |
| bsz_warmup: false | |
| bsz_warmup_rate: 0.016 | |
| return_source: true | |
| synthetic_sample: false | |
| synthetic_batch: false | |
| seq_lens: null | |
| seq_probs: null | |
| enable_sampling_ratios: false | |
| train_path_with_ratio: null | |
| src_weights: null | |
| parse_aug_data: false | |
| loader_accumulate: -1 | |
| bsz_warmup_warmup_step_rate: 0.002 | |
| max_epochs: 1 | |
| pad_idx: 1 | |
| strategy: megatron | |
| megatron_micro_batch_size: 1 | |
| use_rmpad: true | |
| hidden_size: -1 | |
| megatron_sequence_parallel: false | |
| max_position_embeddings: 2048 | |
| position_embeddings_type: absolute | |
| use_sequence_parallel_attention: false | |
| use_sequence_parallel_attention_a2a: false | |
| resume_ckpt_path: '' | |
| val_override_est_steps: false | |
| init_without_cli: true | |
| rope_mode: default | |
| rope_scale: 1 | |
| rope_base: 500000.0 | |
| rope_cut: false | |
| rope_cut_head_dim: 0 | |
| init_val_loader_worker_beforehand: false | |
| megatron_global_batch_size: 1 | |
| megatron_tensor_parallel_size: 1 | |
| megatron_pipeline_parallel_size: 1 | |
| n_head: 1 | |
| log_level: INFO | |
| val_only: false | |
| merge_model_states: false | |
| merge_ckpt_dtype: bf16 | |
| merge_cache_dir: ./ | |
| download_ckpt_in_shards: true | |
| gc_interval: 50 | |
| profiler_at_iter: -1 | |
| timer_at_iter: -1 | |
| profile_all_ranks: false | |
| profile_ranks: [] | |
| profile_every_n_steps: -1 | |
| profiler_memory_at_iter: null | |
| profile_max_preview_rank: 0 | |