# Seed-Coder-8B-Base / cruise_cli.yaml (training configuration)
trainer:
default_root_dir: null
default_hdfs_dir: hdfs://haruna/home/byte_data_seed/hdd_hldy/user/yuyu.zhang/seekpath/P61_D6_8B_8M_tp2_stage2_H800_code
logger:
- tracking
- console
log_every_n_steps: 50
benchmark: false
enable_speedmonitor: true
stats_speedmonitor: false
enable_versions: false
detect_anomaly: false
deterministic: false
accelerator: gpu
accelerator_kwargs:
mega_config: null
precision: bf16
max_epochs: 1
max_steps: -1
limit_train_batches: null
limit_val_batches: null
limit_test_batches: null
static_sync_limit_val: false
sync_batchnorm: false
sync_fit_metrics: null
val_check_interval:
- 20000000
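# Note: 20000000 is far above the expected number of optimizer steps for this run
# (on the order of 6e5 if train_size in the data section is a token budget), so
# mid-training validation is effectively disabled, assuming the interval is
# counted in training steps.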
save_before_val: false
accumulate_grad_batches: null
gradient_clip_val: 1.0
max_grad_clip: 0.0
seed: null
summarize_model_depth: 0
resume_ckpt_path: auto
frozen_ckpt_path: null
resume_strict: true
resume_optimizer: true
resume_metadata: true
resume_loader_state: false
callbacks: null
enable_checkpoint:
- 1
- 10000
checkpoint_monitor: step
checkpoint_mode: max
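# Note: monitoring `step` with mode `max` means checkpoint selection simply
# favours the newest step; enable_checkpoint [1, 10000] presumably controls
# when/how often checkpoints are written, though the exact semantics are
# cruise-specific.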
dataloader_timeout: -1
dataloader_retry_limit: 100
dataloader_retry_persistent_limit: 5
find_unused_parameters: false
project_name: seekpath_v3
experiment_name: P61_D6_8B_npu_8M_tp2_H800stage1_code
enable_trace: false
reload_dataloaders_every_n_epochs: -1
strategy: megatron
enable_qat: false
no_quant_module: []
enable_ptq: true
qat_kwargs: {}
optimizer_kwargs:
optimizer:
type: adam
params:
lr: 3.0e-05
betas:
- 0.9
- 0.95
eps: 1.0e-08
weight_decay: 0.1
bias_correction: true
adam_w_mode: true
momentum: 0.9
lr_mult_keys: []
no_weight_decay_keys: []
weight_decay_keys: []
lr_mult_start_epoch: 0
lr_mult: 1.0
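# Note: lr 3.0e-05, betas (0.9, 0.95), eps 1e-08 and weight_decay 0.1 are the
# AdamW-style settings commonly used for LLM (continued) pretraining;
# adam_w_mode: true requests decoupled weight decay.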
scheduler:
type: megatron.optimizer_param_schedule.OptimizerParamScheduler
total_steps_param_name: num_training_steps
warmup_steps_param_name: num_warmup_steps
interval: step
params:
warmup_step_rate: 0.002
lr_end: 0.1
lr_decay_style: constant
lr_decay_rate: 1.0
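# Note: the total/warmup step counts are injected via num_training_steps /
# num_warmup_steps (the *_param_name fields above). With warmup_step_rate 0.002
# and a constant decay style, the LR warms up over roughly 0.2% of total steps
# and then stays at 3.0e-05, assuming warmup_step_rate is a fraction of the
# total step count.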
grad_norm_layers: []
checkpoint_kwargs:
verbose: false
save_last: false
save_weights_only: false
every_n_train_steps: -1
every_n_seconds: -1
save_best: false
storage:
enable_shm_download: false
enable_shm_upload: false
download_thread_num: 16
upload_thread_num: 1
enable_save_checkpoint_async: true
enable_profiler: false
profiler_schedule_kwargs:
wait: 50
warmup: 3
active: 3
repeat: 1
profile_all_ranks: false
enable_bsdp: false
bsdp_num_prefetch: 64
keep_frozen_weights: true
val_reduce_fn: {}
experiment_id: null
enable_omnistore: false
model:
network:
hidden_size: 4096
n_embed: 4096
n_inner: 14336
n_head: 32
n_layer: 32
vocab_size: 155136
max_position_embeddings: 32768
cross_entropy_spilt_num: 1
layer_norm_epsilon: 1.0e-05
activation_function: gelu_new
resid_pdrop: 0.1
embd_pdrop: 0.0
attn_pdrop: 0.1
scale_attn_weights: true
scale_attn_by_inverse_layer_idx: false
reorder_and_upcast_attn: false
initializer_range: 0.009882118
gradient_checkpointing: false
gradient_checkpointing_ln: false
gradient_checkpointing_mlp: false
gradient_checkpointing_start_layers: 0
tie_weight: false
pad_idx: 1
use_ft_flash_attn: false
use_ft_linear: false
use_ft_layernorm: false
use_xperf_rotary: false
use_rmpad: true
fuse_gelu_gemm: false
pad_output: false
position_embeddings_type: rope
skip_n_iters: -1
n_shared_qhead: 4
num_q_heads: -1
num_kv_heads: -1
head_dim: -1
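# Note: a rough size check, assuming a Llama-style block (SwiGLU FFN with three
# 4096x14336 projections) and that n_shared_qhead: 4 means 4 query heads share
# each KV head (i.e. 8 KV heads of dim 128): ~42M attention + ~176M FFN
# parameters per layer, x32 layers ~= 7.0B, plus ~1.27B for the untied
# 155136x4096 input/output embeddings (the vocab size matches the bbpe155k
# tokenizer referenced in the data section), giving roughly 8.2B parameters,
# consistent with the 8B model name.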
kv_mirror_layers: []
kv_mirror_imitated_layers: []
residual_post_ln_layers: []
hyperconnection_rate: -1
repeat_kv_heads: true
sparse_attention_window_size:
- -1
use_query_swiglu: false
query_swiglu_inner_dim: 8192
force_mem_efficient_layers:
- -1
noop_transformer_layers: []
dense_ffn_layers: []
dense_ffn_type: swiglu
dense_ffn_inner_dim: -1
moe_expert_type: exp-xelego
moe_gate_type: caplog-lego
moe_gate_metric_type: lego
moe_expert_exp_level: 4
moe_expert_exp_first_dim_factor: 1.0
moe_expert_exp_first_num: 2
moe_topk: 5
moe_num_expert: 0
moe_expert_eq_dim_factor: 0.25
moe_backend: default
moe_overlap_recomp_grad_comm: false
moe_expert_op_version: V1
moe_aux_loss_weight: 0.001
moe_gate_dropout: 0.0
moe_use_balance: false
moe_expert_group_capacity: 1.0
moe_expert_group_balance_loss_weight: 0.0
moe_expert_groups_in_ep_rank: 1
moe_enable_warmup: false
moe_swiglu_fc1_2_init_scale: 1.0
janus_use_big_op: false
janus_big_op_version: V1
janus_big_op_attn_grad_accum_fusion: true
convert_gate_to_fp32: false
moe_enable_ema_update: 1
query_head_scale_factor: 1
value_moe_num_expert: 0
value_moe_qkv_topk: 4
value_moe_qkv_times: 1
value_moe_is_repeat: true
value_moe_expert_type: linear-lego
moe_pr_scale_factor: 1.0
moe_pr_expert_type: disabled
value_moe_gate_type: default-lego
value_moe_gate_metric_type: default
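# Note: with moe_num_expert: 0 and value_moe_num_expert: 0, the MoE and
# value-MoE paths above are presumably inactive and the model trains as a plain
# dense transformer; the remaining moe_*/janus_* fields look like framework
# defaults.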
lora_rank: 0
save_mixed_ckpt_in_shards: false
save_mixed_model_states_freq: final
cont_train_mode: default
fuse_lora_weight: true
rope_mode: default
rope_scale: 1
rope_base: 500000.0
rope_cut: false
rope_cut_head_dim: 0
rope_force_fp32: false
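# Note: rope_base 500000.0 matches the RoPE base used by Llama-3-class models;
# combined with max_position_embeddings 32768 this targets 32K-token context,
# and rope_scale: 1 means no position interpolation is applied.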
sparse_attention_window_scale: 1
sparse_attention_global_window_size:
- 0
use_attention_bias: false
layer_norm_type: rmsnorm_torch
exact_token_as_loss_denominator: false
use_key_layernorm: false
key_norm_after_rope: false
use_query_layernorm: false
use_context_groupnorm: false
use_mariana_gqa_pattern: false
use_sequence_parallel_attention: false
use_sequence_parallel_attention_a2a: false
context_parallel_use_all_gather: false
fp8_use_bf16_layers: ''
deterministic_mode: false
megatron_tensor_parallel_size: 8
megatron_pipeline_parallel_size: 1
megatron_context_parallel_size: 1
megatron_expert_parallel_size: 1
megatron_expert_parallel_size_in_dp: 1
megatron_context_parallel_query_only: false
megatron_num_layers_per_virtual_pipeline_stage: 0
megatron_micro_batch_size: 1
megatron_global_batch_size: 256
megatron_sequence_parallel: true
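# Note: with TP=8, PP=1, CP=1, the data-parallel size is world_size / 8 under
# the usual Megatron layout, and the number of accumulated micro-batches per
# step is global_batch / (micro_batch * DP) = 256 / DP. Each optimizer step
# then sees 256 * 32768 ~= 8.4M tokens (global batch x max_seq_len), which may
# be the "8M" in the experiment name.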
megatron_recompute_granularity: ''
megatron_use_flash_attention: true
megatron_recompute_method: uniform
megatron_recompute_num_layers: 1
megatron_distribute_saved_activations: false
megatron_enable_distributed_optimizer: true
megatron_use_multi_precision_ddp: false
megatron_sequence_parallel_as_data_parallel_in_optimizer: false
megatron_gather_params_use_alltoall: false
megatron_enable_initial_jit_warmup: true
megatron_accumulate_allreduce_grads_in_fp32: true
megatron_bf16_use_bf16_allreduce_grads: false
megatron_grad_comm_type: ''
megatron_reduce_grads_use_alltoall: false
megatron_scale_loss_in_gradient: false
megatron_scale_gradient_after_allreduce: false
megatron_ddp_impl: local
megatron_bf16_qt: false
megatron_empty_cache_level: 0
megatron_force_fp32_embed: false
megatron_deterministic_flash_attn: false
megatron_switch_pp_and_dp: false
megatron_timing_log_level: 2
megatron_no_load_rng: false
megatron_no_save_rng: false
megatron_no_load_optim: false
megatron_mem_efficient_column_parallel: true
megatron_masked_softmax_fusion: true
megatron_bias_gelu_fusion: false
megatron_bias_dropout_fusion: false
megatron_gradient_accumulation_fusion: true
megatron_overlap_p2p_comm: false
megatron_deallocate_pipeline_outputs: true
megatron_timing_log_option: local
megatron_barrier_with_L1_time: false
megatron_strict_align_diff_with_ds: false
megatron_parallel_linear_force_weight_contiguous: false
megatron_use_mariana_softmax: false
megatron_use_mariana_activation: false
megatron_overlap_data_parallel_communication: false
megatron_overlap_dp_grad_comm: false
megatron_overlap_dp_param_comm: false
megatron_early_prefetch_dp_allgather: true
megatron_use_non_sequential_block: false
megatron_overlap_attn_grad_input_comm: true
megatron_sequence_data_parallel_size: -1
megatron_distributed_sequence_parallel_size: -1
megatron_num_layers_for_pipeline_stages: []
megatron_vocab_parallel_embedding_fusion: false
megatron_embedding_reduce_scatter_for_sp: true
megatron_print_args: true
megatron_grad_norm_skip: -1.0
megatron_reorder_wgrad: false
megatron_offload_activations: false
megatron_offload_ratio: 1.0
megatron_offload_launch_ratio: 1.0
megatron_optimizer_offload_main_param: false
megatron_data_parallel_random_init: false
megatron_pipeline_strategy: ''
megatron_pipeline_wgrad_strategy: ''
megatron_pipeline_warmup_overlap: false
megatron_allow_transformer_engine: false
megatron_fp8_e4m3: false
megatron_fp8_hybrid: false
megatron_fp8_wgrad: true
megatron_fp8_dgrad: true
megatron_fp8_margin: 0
megatron_fp8_interval: 1
megatron_transformer_impl: local
megatron_fp8_amax_history_len: 1024
megatron_fp8_amax_compute_algo: max
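# Note: the fp8_* options above normally require Transformer Engine; with
# megatron_allow_transformer_engine: false and megatron_transformer_impl: local
# they are presumably inert for this run.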
megatron_use_qlora: false
megatron_qlora_quant_weight_dtype: null
megatron_qlora_quant_real_store: false
megatron_qlora_quant_groupsize: -1
megatron_qlora_quant_input_dtype: ''
megatron_qlora_quant_aware_lora: false
megatron_qlora_quant_aware_L4Q: false
megatron_terapipe_nano_batch_size: -1
lora_config:
default:
lora_dropout: 0.0
lora_rank: 64
layers:
- all
init_method: normal
init_mode: nonzero_parallel_init
init_kwargs: {}
lora_alpha: 2.0
use_rslora: true
lora_experts_appr: full
use_qlora: false
qlora_quant_weight_dtype: null
qlora_quant_real_store: false
qlora_quant_aware_L4Q: false
qlora_quant_groupsize: -1
qlora_quant_input_dtype: None
qlora_quant_aware_lora: false
post_training_quant: false
fully_sharded: false
emb_trainable: true
target_modules:
- query_key_value
- experts
- dense
query_key_value:
lora_rank: -1
lora_alpha: -1.0
experts:
lora_rank: -1
lora_alpha: -1.0
dense:
lora_rank: -1
lora_alpha: -1.0
dense_h_to_4h:
lora_rank: -1
lora_alpha: -1.0
dense_4h_to_h:
lora_rank: -1
lora_alpha: -1.0
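# Note: with lora_rank: 0 at the network level and the per-module ranks left at
# -1, this lora_config block appears to be unused defaults; the run looks like
# full-parameter training rather than LoRA fine-tuning.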
freeze_prefix: null
partial_pretrain: null
partial_pretrain_rename: null
reset_global_step: -1
override_lr_scheduler: true
start_debug_server: false
clip_token_ids: false
data:
train_path: hdfs://haruna/home/byte_data_seed/hdd_hldy/seed_code_seekpath/pretrained_yaml_new/V1_longct_datacard_hdfs_new_stage2_code_ct_fim_2.yaml
val_path:
- hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/D73_val_20240507_2_200M_token_plain_source_v2_1_part
- hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/human_all_lite
- hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/autoeval_code_val_lite
- hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/pretrain_auto_eval_merged_all_20240412_ceval_1_part
- hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/merged_few_benchmark_datasets_20240705_1_part
- hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/pretrain_auto_eval_merged_all_v0.3_1_part
- hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/D74_val_20240621_200M_token_tok643_sa8192_plain_source_v2_1_part_dir
train_size: 5000000000000
val_size: -1
train_batch_size: 32
train_num_workers: 4
val_batch_size: -1
val_num_workers: 1
max_seq_len: 32768
val_max_seq_len: -1
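# Note: if train_size (5e12) is a token budget, then at ~8.4M tokens per global
# step (256 x 32768) the run would take on the order of 6e5 optimizer steps,
# and the warmup_step_rate of 0.002 in the scheduler would correspond to
# roughly 1.2k warmup steps.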
text_keys:
- content_split
tokenizer: hdfs://haruna/home/byte_data_seed/hl_lq/seed_code/liuyongfei/tokenizers/bbpe155k-v6.4.3-ml.pret
gpu_prefetch: false
cpu_prefetch: false
dyn_bsz: true
dyn_bsz_margin: 0.0
stride: -1
warmup_step_rate: -1.0
tokenizer_type: bbpe
bsz_warmup: false
bsz_warmup_rate: 0.016
return_source: true
synthetic_sample: false
synthetic_batch: false
seq_lens: null
seq_probs: null
enable_sampling_ratios: false
train_path_with_ratio: null
src_weights: null
parse_aug_data: false
loader_accumulate: -1
bsz_warmup_warmup_step_rate: 0.002
max_epochs: 1
pad_idx: 1
strategy: megatron
megatron_micro_batch_size: 1
use_rmpad: true
hidden_size: -1
megatron_sequence_parallel: false
max_position_embeddings: 2048
position_embeddings_type: absolute
use_sequence_parallel_attention: false
use_sequence_parallel_attention_a2a: false
resume_ckpt_path: ''
val_override_est_steps: false
init_without_cli: true
rope_mode: default
rope_scale: 1
rope_base: 500000.0
rope_cut: false
rope_cut_head_dim: 0
init_val_loader_worker_beforehand: false
megatron_global_batch_size: 1
megatron_tensor_parallel_size: 1
megatron_pipeline_parallel_size: 1
n_head: 1
log_level: INFO
val_only: false
merge_model_states: false
merge_ckpt_dtype: bf16
merge_cache_dir: ./
download_ckpt_in_shards: true
gc_interval: 50
profiler_at_iter: -1
timer_at_iter: -1
profile_all_ranks: false
profile_ranks: []
profile_every_n_steps: -1
profiler_memory_at_iter: null
profile_max_preview_rank: 0