yuyuzhang committed on
Commit
2608398
·
verified ·
1 Parent(s): e9d6499

Delete cruise_cli.yaml

Browse files
Files changed (1) hide show
  1. cruise_cli.yaml +0 -444
cruise_cli.yaml DELETED
@@ -1,444 +0,0 @@
1
- trainer:
2
- default_root_dir: null
3
- default_hdfs_dir: hdfs://haruna/home/byte_data_seed/hdd_hldy/user/yuyu.zhang/seekpath/P61_D6_8B_8M_tp2_stage2_H800_code
4
- logger:
5
- - tracking
6
- - console
7
- log_every_n_steps: 50
8
- benchmark: false
9
- enable_speedmonitor: true
10
- stats_speedmonitor: false
11
- enable_versions: false
12
- detect_anomaly: false
13
- deterministic: false
14
- accelerator: gpu
15
- accelerator_kwargs:
16
- mega_config: null
17
- precision: bf16
18
- max_epochs: 1
19
- max_steps: -1
20
- limit_train_batches: null
21
- limit_val_batches: null
22
- limit_test_batches: null
23
- static_sync_limit_val: false
24
- sync_batchnorm: false
25
- sync_fit_metrics: null
26
- val_check_interval:
27
- - 20000000
28
- save_before_val: false
29
- accumulate_grad_batches: null
30
- gradient_clip_val: 1.0
31
- max_grad_clip: 0.0
32
- seed: null
33
- summarize_model_depth: 0
34
- resume_ckpt_path: auto
35
- frozen_ckpt_path: null
36
- resume_strict: true
37
- resume_optimizer: true
38
- resume_metadata: true
39
- resume_loader_state: false
40
- callbacks: null
41
- enable_checkpoint:
42
- - 1
43
- - 10000
44
- checkpoint_monitor: step
45
- checkpoint_mode: max
46
- dataloader_timeout: -1
47
- dataloader_retry_limit: 100
48
- dataloader_retry_persistent_limit: 5
49
- find_unused_parameters: false
50
- project_name: seekpath_v3
51
- experiment_name: P61_D6_8B_npu_8M_tp2_H800stage1_code
52
- enable_trace: false
53
- reload_dataloaders_every_n_epochs: -1
54
- strategy: megatron
55
- enable_qat: false
56
- no_quant_module: []
57
- enable_ptq: true
58
- qat_kwargs: {}
59
- optimizer_kwargs:
60
- optimizer:
61
- type: adam
62
- params:
63
- lr: 3.0e-05
64
- betas:
65
- - 0.9
66
- - 0.95
67
- eps: 1.0e-08
68
- weight_decay: 0.1
69
- bias_correction: true
70
- adam_w_mode: true
71
- momentum: 0.9
72
- lr_mult_keys: []
73
- no_weight_decay_keys: []
74
- weight_decay_keys: []
75
- lr_mult_start_epoch: 0
76
- lr_mult: 1.0
77
- scheduler:
78
- type: megatron.optimizer_param_schedule.OptimizerParamScheduler
79
- total_steps_param_name: num_training_steps
80
- warmup_steps_param_name: num_warmup_steps
81
- interval: step
82
- params:
83
- warmup_step_rate: 0.002
84
- lr_end: 0.1
85
- lr_decay_style: constant
86
- lr_decay_rate: 1.0
87
- grad_norm_layers: []
88
- checkpoint_kwargs:
89
- verbose: false
90
- save_last: false
91
- save_weights_only: false
92
- every_n_train_steps: -1
93
- every_n_seconds: -1
94
- save_best: false
95
- storage:
96
- enable_shm_download: false
97
- enable_shm_upload: false
98
- download_thread_num: 16
99
- upload_thread_num: 1
100
- enable_save_checkpoint_async: true
101
- enable_profiler: false
102
- profiler_schedule_kwargs:
103
- wait: 50
104
- warmup: 3
105
- active: 3
106
- repeat: 1
107
- profile_all_ranks: false
108
- enable_bsdp: false
109
- bsdp_num_prefetch: 64
110
- keep_frozen_weights: true
111
- val_reduce_fn: {}
112
- experiment_id: null
113
- enable_omnistore: false
114
- model:
115
- network:
116
- hidden_size: 4096
117
- n_embed: 4096
118
- n_inner: 14336
119
- n_head: 32
120
- n_layer: 32
121
- vocab_size: 155136
122
- max_position_embeddings: 32768
123
- cross_entropy_spilt_num: 1
124
- layer_norm_epsilon: 1.0e-05
125
- activation_function: gelu_new
126
- resid_pdrop: 0.1
127
- embd_pdrop: 0.0
128
- attn_pdrop: 0.1
129
- scale_attn_weights: true
130
- scale_attn_by_inverse_layer_idx: false
131
- reorder_and_upcast_attn: false
132
- initializer_range: 0.009882118
133
- gradient_checkpointing: false
134
- gradient_checkpointing_ln: false
135
- gradient_checkpointing_mlp: false
136
- gradient_checkpointing_start_layers: 0
137
- tie_weight: false
138
- pad_idx: 1
139
- use_ft_flash_attn: false
140
- use_ft_linear: false
141
- use_ft_layernorm: false
142
- use_xperf_rotary: false
143
- use_rmpad: true
144
- fuse_gelu_gemm: false
145
- pad_output: false
146
- position_embeddings_type: rope
147
- skip_n_iters: -1
148
- n_shared_qhead: 4
149
- num_q_heads: -1
150
- num_kv_heads: -1
151
- head_dim: -1
152
- kv_mirror_layers: []
153
- kv_mirror_imitated_layers: []
154
- residual_post_ln_layers: []
155
- hyperconnection_rate: -1
156
- repeat_kv_heads: true
157
- sparse_attention_window_size:
158
- - -1
159
- use_query_swiglu: false
160
- query_swiglu_inner_dim: 8192
161
- force_mem_efficient_layers:
162
- - -1
163
- noop_transformer_layers: []
164
- dense_ffn_layers: []
165
- dense_ffn_type: swiglu
166
- dense_ffn_inner_dim: -1
167
- moe_expert_type: exp-xelego
168
- moe_gate_type: caplog-lego
169
- moe_gate_metric_type: lego
170
- moe_expert_exp_level: 4
171
- moe_expert_exp_first_dim_factor: 1.0
172
- moe_expert_exp_first_num: 2
173
- moe_topk: 5
174
- moe_num_expert: 0
175
- moe_expert_eq_dim_factor: 0.25
176
- moe_backend: default
177
- moe_overlap_recomp_grad_comm: false
178
- moe_expert_op_version: V1
179
- moe_aux_loss_weight: 0.001
180
- moe_gate_dropout: 0.0
181
- moe_use_balance: false
182
- moe_expert_group_capacity: 1.0
183
- moe_expert_group_balance_loss_weight: 0.0
184
- moe_expert_groups_in_ep_rank: 1
185
- moe_enable_warmup: false
186
- moe_swiglu_fc1_2_init_scale: 1.0
187
- janus_use_big_op: false
188
- janus_big_op_version: V1
189
- janus_big_op_attn_grad_accum_fusion: true
190
- convert_gate_to_fp32: false
191
- moe_enable_ema_update: 1
192
- query_head_scale_factor: 1
193
- value_moe_num_expert: 0
194
- value_moe_qkv_topk: 4
195
- value_moe_qkv_times: 1
196
- value_moe_is_repeat: true
197
- value_moe_expert_type: linear-lego
198
- moe_pr_scale_factor: 1.0
199
- moe_pr_expert_type: disabled
200
- value_moe_gate_type: default-lego
201
- value_moe_gate_metric_type: default
202
- lora_rank: 0
203
- save_mixed_ckpt_in_shards: false
204
- save_mixed_model_states_freq: final
205
- cont_train_mode: default
206
- fuse_lora_weight: true
207
- rope_mode: default
208
- rope_scale: 1
209
- rope_base: 500000.0
210
- rope_cut: false
211
- rope_cut_head_dim: 0
212
- rope_force_fp32: false
213
- sparse_attention_window_scale: 1
214
- sparse_attention_global_window_size:
215
- - 0
216
- use_attention_bias: false
217
- layer_norm_type: rmsnorm_torch
218
- exact_token_as_loss_denominator: false
219
- use_key_layernorm: false
220
- key_norm_after_rope: false
221
- use_query_layernorm: false
222
- use_context_groupnorm: false
223
- use_mariana_gqa_pattern: false
224
- use_sequence_parallel_attention: false
225
- use_sequence_parallel_attention_a2a: false
226
- context_parallel_use_all_gather: false
227
- fp8_use_bf16_layers: ''
228
- deterministic_mode: false
229
- megatron_tensor_parallel_size: 8
230
- megatron_pipeline_parallel_size: 1
231
- megatron_context_parallel_size: 1
232
- megatron_expert_parallel_size: 1
233
- megatron_expert_parallel_size_in_dp: 1
234
- megatron_context_parallel_query_only: false
235
- megatron_num_layers_per_virtual_pipeline_stage: 0
236
- megatron_micro_batch_size: 1
237
- megatron_global_batch_size: 256
238
- megatron_sequence_parallel: true
239
- megatron_recompute_granularity: ''
240
- megatron_use_flash_attention: true
241
- megatron_recompute_method: uniform
242
- megatron_recompute_num_layers: 1
243
- megatron_distribute_saved_activations: false
244
- megatron_enable_distributed_optimizer: true
245
- megatron_use_multi_precision_ddp: false
246
- megatron_sequence_parallel_as_data_parallel_in_optimizer: false
247
- megatron_gather_params_use_alltoall: false
248
- megatron_enable_initial_jit_warmup: true
249
- megatron_accumulate_allreduce_grads_in_fp32: true
250
- megatron_bf16_use_bf16_allreduce_grads: false
251
- megatron_grad_comm_type: ''
252
- megatron_reduce_grads_use_alltoall: false
253
- megatron_scale_loss_in_gradient: false
254
- megatron_scale_gradient_after_allreduce: false
255
- megatron_ddp_impl: local
256
- megatron_bf16_qt: false
257
- megatron_empty_cache_level: 0
258
- megatron_force_fp32_embed: false
259
- megatron_deterministic_flash_attn: false
260
- megatron_switch_pp_and_dp: false
261
- megatron_timing_log_level: 2
262
- megatron_no_load_rng: false
263
- megatron_no_save_rng: false
264
- megatron_no_load_optim: false
265
- megatron_mem_efficient_column_parallel: true
266
- megatron_masked_softmax_fusion: true
267
- megatron_bias_gelu_fusion: false
268
- megatron_bias_dropout_fusion: false
269
- megatron_gradient_accumulation_fusion: true
270
- megatron_overlap_p2p_comm: false
271
- megatron_deallocate_pipeline_outputs: true
272
- megatron_timing_log_option: local
273
- megatron_barrier_with_L1_time: false
274
- megatron_strict_align_diff_with_ds: false
275
- megatron_parallel_linear_force_weight_contiguous: false
276
- megatron_use_mariana_softmax: false
277
- megatron_use_mariana_activation: false
278
- megatron_overlap_data_parallel_communication: false
279
- megatron_overlap_dp_grad_comm: false
280
- megatron_overlap_dp_param_comm: false
281
- megatron_early_prefetch_dp_allgather: true
282
- megatron_use_non_sequential_block: false
283
- megatron_overlap_attn_grad_input_comm: true
284
- megatron_sequence_data_parallel_size: -1
285
- megatron_distributed_sequence_parallel_size: -1
286
- megatron_num_layers_for_pipeline_stages: []
287
- megatron_vocab_parallel_embedding_fusion: false
288
- megatron_embedding_reduce_scatter_for_sp: true
289
- megatron_print_args: true
290
- megatron_grad_norm_skip: -1.0
291
- megatron_reorder_wgrad: false
292
- megatron_offload_activations: false
293
- megatron_offload_ratio: 1.0
294
- megatron_offload_launch_ratio: 1.0
295
- megatron_optimizer_offload_main_param: false
296
- megatron_data_parallel_random_init: false
297
- megatron_pipeline_strategy: ''
298
- megatron_pipeline_wgrad_strategy: ''
299
- megatron_pipeline_warmup_overlap: false
300
- megatron_allow_transformer_engine: false
301
- megatron_fp8_e4m3: false
302
- megatron_fp8_hybrid: false
303
- megatron_fp8_wgrad: true
304
- megatron_fp8_dgrad: true
305
- megatron_fp8_margin: 0
306
- megatron_fp8_interval: 1
307
- megatron_transformer_impl: local
308
- megatron_fp8_amax_history_len: 1024
309
- megatron_fp8_amax_compute_algo: max
310
- megatron_use_qlora: false
311
- megatron_qlora_quant_weight_dtype: null
312
- megatron_qlora_quant_real_store: false
313
- megatron_qlora_quant_groupsize: -1
314
- megatron_qlora_quant_input_dtype: ''
315
- megatron_qlora_quant_aware_lora: false
316
- megatron_qlora_quant_aware_L4Q: false
317
- megatron_terapipe_nano_batch_size: -1
318
- lora_config:
319
- default:
320
- lora_dropout: 0.0
321
- lora_rank: 64
322
- layers:
323
- - all
324
- init_method: normal
325
- init_mode: nonzero_parallel_init
326
- init_kwargs: {}
327
- lora_alpha: 2.0
328
- use_rslora: true
329
- lora_experts_appr: full
330
- use_qlora: false
331
- qlora_quant_weight_dtype: null
332
- qlora_quant_real_store: false
333
- qlora_quant_aware_L4Q: false
334
- qlora_quant_groupsize: -1
335
- qlora_quant_input_dtype: None
336
- qlora_quant_aware_lora: false
337
- post_training_quant: false
338
- fully_sharded: false
339
- emb_trainable: true
340
- target_modules:
341
- - query_key_value
342
- - experts
343
- - dense
344
- query_key_value:
345
- lora_rank: -1
346
- lora_alpha: -1.0
347
- experts:
348
- lora_rank: -1
349
- lora_alpha: -1.0
350
- dense:
351
- lora_rank: -1
352
- lora_alpha: -1.0
353
- dense_h_to_4h:
354
- lora_rank: -1
355
- lora_alpha: -1.0
356
- dense_4h_to_h:
357
- lora_rank: -1
358
- lora_alpha: -1.0
359
- freeze_prefix: null
360
- partial_pretrain: null
361
- partial_pretrain_rename: null
362
- reset_global_step: -1
363
- override_lr_scheduler: true
364
- start_debug_server: false
365
- clip_token_ids: false
366
- data:
367
- train_path: hdfs://haruna/home/byte_data_seed/hdd_hldy/seed_code_seekpath/pretrained_yaml_new/V1_longct_datacard_hdfs_new_stage2_code_ct_fim_2.yaml
368
- val_path:
369
- - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/D73_val_20240507_2_200M_token_plain_source_v2_1_part
370
- - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/human_all_lite
371
- - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/autoeval_code_val_lite
372
- - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/pretrain_auto_eval_merged_all_20240412_ceval_1_part
373
- - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/merged_few_benchmark_datasets_20240705_1_part
374
- - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/pretrain_auto_eval_merged_all_v0.3_1_part
375
- - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/D74_val_20240621_200M_token_tok643_sa8192_plain_source_v2_1_part_dir
376
- train_size: 5000000000000
377
- val_size: -1
378
- train_batch_size: 32
379
- train_num_workers: 4
380
- val_batch_size: -1
381
- val_num_workers: 1
382
- max_seq_len: 32768
383
- val_max_seq_len: -1
384
- text_keys:
385
- - content_split
386
- tokenizer: hdfs://haruna/home/byte_data_seed/hl_lq/seed_code/liuyongfei/tokenizers/bbpe155k-v6.4.3-ml.pret
387
- gpu_prefetch: false
388
- cpu_prefetch: false
389
- dyn_bsz: true
390
- dyn_bsz_margin: 0.0
391
- stride: -1
392
- warmup_step_rate: -1.0
393
- tokenizer_type: bbpe
394
- bsz_warmup: false
395
- bsz_warmup_rate: 0.016
396
- return_source: true
397
- synthetic_sample: false
398
- synthetic_batch: false
399
- seq_lens: null
400
- seq_probs: null
401
- enable_sampling_ratios: false
402
- train_path_with_ratio: null
403
- src_weights: null
404
- parse_aug_data: false
405
- loader_accumulate: -1
406
- bsz_warmup_warmup_step_rate: 0.002
407
- max_epochs: 1
408
- pad_idx: 1
409
- strategy: megatron
410
- megatron_micro_batch_size: 1
411
- use_rmpad: true
412
- hidden_size: -1
413
- megatron_sequence_parallel: false
414
- max_position_embeddings: 2048
415
- position_embeddings_type: absolute
416
- use_sequence_parallel_attention: false
417
- use_sequence_parallel_attention_a2a: false
418
- resume_ckpt_path: ''
419
- val_override_est_steps: false
420
- init_without_cli: true
421
- rope_mode: default
422
- rope_scale: 1
423
- rope_base: 500000.0
424
- rope_cut: false
425
- rope_cut_head_dim: 0
426
- init_val_loader_worker_beforehand: false
427
- megatron_global_batch_size: 1
428
- megatron_tensor_parallel_size: 1
429
- megatron_pipeline_parallel_size: 1
430
- n_head: 1
431
- log_level: INFO
432
- val_only: false
433
- merge_model_states: false
434
- merge_ckpt_dtype: bf16
435
- merge_cache_dir: ./
436
- download_ckpt_in_shards: true
437
- gc_interval: 50
438
- profiler_at_iter: -1
439
- timer_at_iter: -1
440
- profile_all_ranks: false
441
- profile_ranks: []
442
- profile_every_n_steps: -1
443
- profiler_memory_at_iter: null
444
- profile_max_preview_rank: 0