ByteDance-Seed
/

Seed-Coder-8B-Base

@@ -1,444 +0,0 @@
-trainer:
-  default_root_dir: null
-  default_hdfs_dir: hdfs://haruna/home/byte_data_seed/hdd_hldy/user/yuyu.zhang/seekpath/P61_D6_8B_8M_tp2_stage2_H800_code
-  logger:
-  - tracking
-  - console
-  log_every_n_steps: 50
-  benchmark: false
-  enable_speedmonitor: true
-  stats_speedmonitor: false
-  enable_versions: false
-  detect_anomaly: false
-  deterministic: false
-  accelerator: gpu
-  accelerator_kwargs:
-    mega_config: null
-  precision: bf16
-  max_epochs: 1
-  max_steps: -1
-  limit_train_batches: null
-  limit_val_batches: null
-  limit_test_batches: null
-  static_sync_limit_val: false
-  sync_batchnorm: false
-  sync_fit_metrics: null
-  val_check_interval:
-  - 20000000
-  save_before_val: false
-  accumulate_grad_batches: null
-  gradient_clip_val: 1.0
-  max_grad_clip: 0.0
-  seed: null
-  summarize_model_depth: 0
-  resume_ckpt_path: auto
-  frozen_ckpt_path: null
-  resume_strict: true
-  resume_optimizer: true
-  resume_metadata: true
-  resume_loader_state: false
-  callbacks: null
-  enable_checkpoint:
-  - 1
-  - 10000
-  checkpoint_monitor: step
-  checkpoint_mode: max
-  dataloader_timeout: -1
-  dataloader_retry_limit: 100
-  dataloader_retry_persistent_limit: 5
-  find_unused_parameters: false
-  project_name: seekpath_v3
-  experiment_name: P61_D6_8B_npu_8M_tp2_H800stage1_code
-  enable_trace: false
-  reload_dataloaders_every_n_epochs: -1
-  strategy: megatron
-  enable_qat: false
-  no_quant_module: []
-  enable_ptq: true
-  qat_kwargs: {}
-  optimizer_kwargs:
-    optimizer:
-      type: adam
-      params:
-        lr: 3.0e-05
-        betas:
-        - 0.9
-        - 0.95
-        eps: 1.0e-08
-        weight_decay: 0.1
-        bias_correction: true
-        adam_w_mode: true
-        momentum: 0.9
-        lr_mult_keys: []
-        no_weight_decay_keys: []
-        weight_decay_keys: []
-        lr_mult_start_epoch: 0
-        lr_mult: 1.0
-    scheduler:
-      type: megatron.optimizer_param_schedule.OptimizerParamScheduler
-      total_steps_param_name: num_training_steps
-      warmup_steps_param_name: num_warmup_steps
-      interval: step
-      params:
-        warmup_step_rate: 0.002
-        lr_end: 0.1
-        lr_decay_style: constant
-        lr_decay_rate: 1.0
-  grad_norm_layers: []
-  checkpoint_kwargs:
-    verbose: false
-    save_last: false
-    save_weights_only: false
-    every_n_train_steps: -1
-    every_n_seconds: -1
-    save_best: false
-    storage:
-      enable_shm_download: false
-      enable_shm_upload: false
-      download_thread_num: 16
-      upload_thread_num: 1
-  enable_save_checkpoint_async: true
-  enable_profiler: false
-  profiler_schedule_kwargs:
-    wait: 50
-    warmup: 3
-    active: 3
-    repeat: 1
-  profile_all_ranks: false
-  enable_bsdp: false
-  bsdp_num_prefetch: 64
-  keep_frozen_weights: true
-  val_reduce_fn: {}
-  experiment_id: null
-  enable_omnistore: false
-model:
-  network:
-    hidden_size: 4096
-    n_embed: 4096
-    n_inner: 14336
-    n_head: 32
-    n_layer: 32
-    vocab_size: 155136
-    max_position_embeddings: 32768
-    cross_entropy_spilt_num: 1
-    layer_norm_epsilon: 1.0e-05
-    activation_function: gelu_new
-    resid_pdrop: 0.1
-    embd_pdrop: 0.0
-    attn_pdrop: 0.1
-    scale_attn_weights: true
-    scale_attn_by_inverse_layer_idx: false
-    reorder_and_upcast_attn: false
-    initializer_range: 0.009882118
-    gradient_checkpointing: false
-    gradient_checkpointing_ln: false
-    gradient_checkpointing_mlp: false
-    gradient_checkpointing_start_layers: 0
-    tie_weight: false
-    pad_idx: 1
-    use_ft_flash_attn: false
-    use_ft_linear: false
-    use_ft_layernorm: false
-    use_xperf_rotary: false
-    use_rmpad: true
-    fuse_gelu_gemm: false
-    pad_output: false
-    position_embeddings_type: rope
-    skip_n_iters: -1
-    n_shared_qhead: 4
-    num_q_heads: -1
-    num_kv_heads: -1
-    head_dim: -1
-    kv_mirror_layers: []
-    kv_mirror_imitated_layers: []
-    residual_post_ln_layers: []
-    hyperconnection_rate: -1
-    repeat_kv_heads: true
-    sparse_attention_window_size:
-    - -1
-    use_query_swiglu: false
-    query_swiglu_inner_dim: 8192
-    force_mem_efficient_layers:
-    - -1
-    noop_transformer_layers: []
-    dense_ffn_layers: []
-    dense_ffn_type: swiglu
-    dense_ffn_inner_dim: -1
-    moe_expert_type: exp-xelego
-    moe_gate_type: caplog-lego
-    moe_gate_metric_type: lego
-    moe_expert_exp_level: 4
-    moe_expert_exp_first_dim_factor: 1.0
-    moe_expert_exp_first_num: 2
-    moe_topk: 5
-    moe_num_expert: 0
-    moe_expert_eq_dim_factor: 0.25
-    moe_backend: default
-    moe_overlap_recomp_grad_comm: false
-    moe_expert_op_version: V1
-    moe_aux_loss_weight: 0.001
-    moe_gate_dropout: 0.0
-    moe_use_balance: false
-    moe_expert_group_capacity: 1.0
-    moe_expert_group_balance_loss_weight: 0.0
-    moe_expert_groups_in_ep_rank: 1
-    moe_enable_warmup: false
-    moe_swiglu_fc1_2_init_scale: 1.0
-    janus_use_big_op: false
-    janus_big_op_version: V1
-    janus_big_op_attn_grad_accum_fusion: true
-    convert_gate_to_fp32: false
-    moe_enable_ema_update: 1
-    query_head_scale_factor: 1
-    value_moe_num_expert: 0
-    value_moe_qkv_topk: 4
-    value_moe_qkv_times: 1
-    value_moe_is_repeat: true
-    value_moe_expert_type: linear-lego
-    moe_pr_scale_factor: 1.0
-    moe_pr_expert_type: disabled
-    value_moe_gate_type: default-lego
-    value_moe_gate_metric_type: default
-    lora_rank: 0
-    save_mixed_ckpt_in_shards: false
-    save_mixed_model_states_freq: final
-    cont_train_mode: default
-    fuse_lora_weight: true
-    rope_mode: default
-    rope_scale: 1
-    rope_base: 500000.0
-    rope_cut: false
-    rope_cut_head_dim: 0
-    rope_force_fp32: false
-    sparse_attention_window_scale: 1
-    sparse_attention_global_window_size:
-    - 0
-    use_attention_bias: false
-    layer_norm_type: rmsnorm_torch
-    exact_token_as_loss_denominator: false
-    use_key_layernorm: false
-    key_norm_after_rope: false
-    use_query_layernorm: false
-    use_context_groupnorm: false
-    use_mariana_gqa_pattern: false
-    use_sequence_parallel_attention: false
-    use_sequence_parallel_attention_a2a: false
-    context_parallel_use_all_gather: false
-    fp8_use_bf16_layers: ''
-    deterministic_mode: false
-    megatron_tensor_parallel_size: 8
-    megatron_pipeline_parallel_size: 1
-    megatron_context_parallel_size: 1
-    megatron_expert_parallel_size: 1
-    megatron_expert_parallel_size_in_dp: 1
-    megatron_context_parallel_query_only: false
-    megatron_num_layers_per_virtual_pipeline_stage: 0
-    megatron_micro_batch_size: 1
-    megatron_global_batch_size: 256
-    megatron_sequence_parallel: true
-    megatron_recompute_granularity: ''
-    megatron_use_flash_attention: true
-    megatron_recompute_method: uniform
-    megatron_recompute_num_layers: 1
-    megatron_distribute_saved_activations: false
-    megatron_enable_distributed_optimizer: true
-    megatron_use_multi_precision_ddp: false
-    megatron_sequence_parallel_as_data_parallel_in_optimizer: false
-    megatron_gather_params_use_alltoall: false
-    megatron_enable_initial_jit_warmup: true
-    megatron_accumulate_allreduce_grads_in_fp32: true
-    megatron_bf16_use_bf16_allreduce_grads: false
-    megatron_grad_comm_type: ''
-    megatron_reduce_grads_use_alltoall: false
-    megatron_scale_loss_in_gradient: false
-    megatron_scale_gradient_after_allreduce: false
-    megatron_ddp_impl: local
-    megatron_bf16_qt: false
-    megatron_empty_cache_level: 0
-    megatron_force_fp32_embed: false
-    megatron_deterministic_flash_attn: false
-    megatron_switch_pp_and_dp: false
-    megatron_timing_log_level: 2
-    megatron_no_load_rng: false
-    megatron_no_save_rng: false
-    megatron_no_load_optim: false
-    megatron_mem_efficient_column_parallel: true
-    megatron_masked_softmax_fusion: true
-    megatron_bias_gelu_fusion: false
-    megatron_bias_dropout_fusion: false
-    megatron_gradient_accumulation_fusion: true
-    megatron_overlap_p2p_comm: false
-    megatron_deallocate_pipeline_outputs: true
-    megatron_timing_log_option: local
-    megatron_barrier_with_L1_time: false
-    megatron_strict_align_diff_with_ds: false
-    megatron_parallel_linear_force_weight_contiguous: false
-    megatron_use_mariana_softmax: false
-    megatron_use_mariana_activation: false
-    megatron_overlap_data_parallel_communication: false
-    megatron_overlap_dp_grad_comm: false
-    megatron_overlap_dp_param_comm: false
-    megatron_early_prefetch_dp_allgather: true
-    megatron_use_non_sequential_block: false
-    megatron_overlap_attn_grad_input_comm: true
-    megatron_sequence_data_parallel_size: -1
-    megatron_distributed_sequence_parallel_size: -1
-    megatron_num_layers_for_pipeline_stages: []
-    megatron_vocab_parallel_embedding_fusion: false
-    megatron_embedding_reduce_scatter_for_sp: true
-    megatron_print_args: true
-    megatron_grad_norm_skip: -1.0
-    megatron_reorder_wgrad: false
-    megatron_offload_activations: false
-    megatron_offload_ratio: 1.0
-    megatron_offload_launch_ratio: 1.0
-    megatron_optimizer_offload_main_param: false
-    megatron_data_parallel_random_init: false
-    megatron_pipeline_strategy: ''
-    megatron_pipeline_wgrad_strategy: ''
-    megatron_pipeline_warmup_overlap: false
-    megatron_allow_transformer_engine: false
-    megatron_fp8_e4m3: false
-    megatron_fp8_hybrid: false
-    megatron_fp8_wgrad: true
-    megatron_fp8_dgrad: true
-    megatron_fp8_margin: 0
-    megatron_fp8_interval: 1
-    megatron_transformer_impl: local
-    megatron_fp8_amax_history_len: 1024
-    megatron_fp8_amax_compute_algo: max
-    megatron_use_qlora: false
-    megatron_qlora_quant_weight_dtype: null
-    megatron_qlora_quant_real_store: false
-    megatron_qlora_quant_groupsize: -1
-    megatron_qlora_quant_input_dtype: ''
-    megatron_qlora_quant_aware_lora: false
-    megatron_qlora_quant_aware_L4Q: false
-    megatron_terapipe_nano_batch_size: -1
-  lora_config:
-    default:
-      lora_dropout: 0.0
-      lora_rank: 64
-      layers:
-      - all
-      init_method: normal
-      init_mode: nonzero_parallel_init
-      init_kwargs: {}
-      lora_alpha: 2.0
-      use_rslora: true
-      lora_experts_appr: full
-      use_qlora: false
-      qlora_quant_weight_dtype: null
-      qlora_quant_real_store: false
-      qlora_quant_aware_L4Q: false
-      qlora_quant_groupsize: -1
-      qlora_quant_input_dtype: None
-      qlora_quant_aware_lora: false
-      post_training_quant: false
-      fully_sharded: false
-      emb_trainable: true
-    target_modules:
-    - query_key_value
-    - experts
-    - dense
-    query_key_value:
-      lora_rank: -1
-      lora_alpha: -1.0
-    experts:
-      lora_rank: -1
-      lora_alpha: -1.0
-    dense:
-      lora_rank: -1
-      lora_alpha: -1.0
-    dense_h_to_4h:
-      lora_rank: -1
-      lora_alpha: -1.0
-    dense_4h_to_h:
-      lora_rank: -1
-      lora_alpha: -1.0
-  freeze_prefix: null
-  partial_pretrain: null
-  partial_pretrain_rename: null
-  reset_global_step: -1
-  override_lr_scheduler: true
-  start_debug_server: false
-  clip_token_ids: false
-data:
-  train_path: hdfs://haruna/home/byte_data_seed/hdd_hldy/seed_code_seekpath/pretrained_yaml_new/V1_longct_datacard_hdfs_new_stage2_code_ct_fim_2.yaml
-  val_path:
-  - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/D73_val_20240507_2_200M_token_plain_source_v2_1_part
-  - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/human_all_lite
-  - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/autoeval_code_val_lite
-  - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/pretrain_auto_eval_merged_all_20240412_ceval_1_part
-  - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/merged_few_benchmark_datasets_20240705_1_part
-  - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/pretrain_auto_eval_merged_all_v0.3_1_part
-  - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/D74_val_20240621_200M_token_tok643_sa8192_plain_source_v2_1_part_dir
-  train_size: 5000000000000
-  val_size: -1
-  train_batch_size: 32
-  train_num_workers: 4
-  val_batch_size: -1
-  val_num_workers: 1
-  max_seq_len: 32768
-  val_max_seq_len: -1
-  text_keys:
-  - content_split
-  tokenizer: hdfs://haruna/home/byte_data_seed/hl_lq/seed_code/liuyongfei/tokenizers/bbpe155k-v6.4.3-ml.pret
-  gpu_prefetch: false
-  cpu_prefetch: false
-  dyn_bsz: true
-  dyn_bsz_margin: 0.0
-  stride: -1
-  warmup_step_rate: -1.0
-  tokenizer_type: bbpe
-  bsz_warmup: false
-  bsz_warmup_rate: 0.016
-  return_source: true
-  synthetic_sample: false
-  synthetic_batch: false
-  seq_lens: null
-  seq_probs: null
-  enable_sampling_ratios: false
-  train_path_with_ratio: null
-  src_weights: null
-  parse_aug_data: false
-  loader_accumulate: -1
-  bsz_warmup_warmup_step_rate: 0.002
-  max_epochs: 1
-  pad_idx: 1
-  strategy: megatron
-  megatron_micro_batch_size: 1
-  use_rmpad: true
-  hidden_size: -1
-  megatron_sequence_parallel: false
-  max_position_embeddings: 2048
-  position_embeddings_type: absolute
-  use_sequence_parallel_attention: false
-  use_sequence_parallel_attention_a2a: false
-  resume_ckpt_path: ''
-  val_override_est_steps: false
-  init_without_cli: true
-  rope_mode: default
-  rope_scale: 1
-  rope_base: 500000.0
-  rope_cut: false
-  rope_cut_head_dim: 0
-  init_val_loader_worker_beforehand: false
-  megatron_global_batch_size: 1
-  megatron_tensor_parallel_size: 1
-  megatron_pipeline_parallel_size: 1
-  n_head: 1
-log_level: INFO
-val_only: false
-merge_model_states: false
-merge_ckpt_dtype: bf16
-merge_cache_dir: ./
-download_ckpt_in_shards: true
-gc_interval: 50
-profiler_at_iter: -1
-timer_at_iter: -1
-profile_all_ranks: false
-profile_ranks: []
-profile_every_n_steps: -1
-profiler_memory_at_iter: null
-profile_max_preview_rank: 0